diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index fb1c2f2d06..f0884cb7d1 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit fb1c2f2d067f63cebe6799f582f81cb89157e291 +Subproject commit f0884cb7d1ecb12393ddae54622b2c384bb8e2a8 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( 
useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index ac86fac686..e064b4bbfe 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -505,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the 
channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C 
- IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f index fb942500a5..4ff41257c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 0665bfb93b..cd73d52ed3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -591,38 +591,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -648,7 +660,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -665,6 +677,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: 
allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 7de8886b1d..de07450c31 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -86,6 +86,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -110,6 +111,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index bd4803bc0d..4e7ff52f2b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -71,7 +71,7 @@ gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -95,7 +95,7 @@ // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -207,60 +207,61 @@ } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -268,32 +269,52 @@ break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%%d icol=%%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f index 1151dc5a6c..858052727f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f @@ -18,7 +18,7 @@ IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. c ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486) @@ -38,7 +38,7 @@ CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -46,7 +46,7 @@ STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! 
cudacppMEs=0 diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index f8930a863f..d2dba08431 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. 
Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0051648616790771484  +DEBUG: model prefixing takes 0.003051280975341797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,7 +150,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -158,10 +161,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -173,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: 
diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.285 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Wrote files for 8 helas calls in 0.049 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.122 s +ALOHA: aloha creates 3 routines in 0.112 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.152 s +ALOHA: aloha creates 7 routines in 0.184 s FFV1 FFV1 FFV2 @@ -197,32 +200,32 @@ ALOHA: aloha creates 7 routines in 0.152 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.542s -user 0m1.246s -sys 0m0.587s -Code generation completed in 5 seconds +real 0m1.899s +user 0m1.615s +sys 0m0.272s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -243,10 +246,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -273,10 +276,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 7aed5df7db..b3ab00b31d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 1c6406a546..fe8aef0c2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -926,38 +926,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event 
(thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -983,7 +995,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1000,6 +1012,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1062,7 +1075,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1118,7 +1132,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1142,7 +1156,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1254,60 +1268,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1315,32 +1330,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 1469ba9333..b590074a0a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 78c4e66a95..1e083ecd15 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C 
Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 03db576967..cf12adfab5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -349,6 +349,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = 
no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -359,6 +362,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -461,7 +465,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -534,19 +538,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -616,7 +622,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -640,7 +646,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -648,7 +654,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 15e4d1a8a2..fa57230d40 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -304,7 +304,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -342,8 +342,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # 
See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe 
failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: 
bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ 
extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: 
+ raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for 
color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname 
-p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bdea67b952..d052f70ef1 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005957365036010742  +DEBUG: model prefixing takes 0.0037975311279296875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,13 +150,13 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -162,17 +165,17 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 
'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.002 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.171 s +ALOHA: aloha creates 4 routines in 0.146 s FFV1 FFV1 FFV2 @@ -181,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.171 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m1.151s -user 0m0.372s -sys 0m0.155s -Code generation completed in 1 seconds +real 0m0.591s +user 0m0.503s +sys 0m0.085s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all factors are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 22cb8c2604..ac578a326e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -924,38 +924,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -981,7 +993,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -998,6 +1010,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1060,7 +1073,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all factors are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1116,7 +1130,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1140,7 +1154,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1252,60 +1266,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1313,32 +1328,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 1469ba9333..b590074a0a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static 
__device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the 
input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index dbae24afe0..d3995ceb3f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005540609359741211  +DEBUG: model prefixing takes 0.0037679672241210938  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.016 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_ INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -174,49 +177,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.266 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Wrote files for 10 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.089 s +ALOHA: aloha creates 2 routines in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.093 s +ALOHA: aloha creates 4 routines in 0.154 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.687s -user 0m1.163s -sys 0m0.619s -Code generation completed in 5 seconds +real 0m2.703s +user 0m2.247s +sys 0m0.431s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -237,10 +240,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -267,10 +270,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 8b331b055f..38c1f98839 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 
4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 091fecf10e..db2341b9c3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for 
the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1075,7 +1088,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1131,7 +1145,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1169,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1281,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1343,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index b47f79aa45..a6ff6ae67f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + 
const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise 
RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double 
elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname 
-m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 20cc72fd46..8f9e676441 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00434565544128418  +DEBUG: model prefixing takes 0.003092527389526367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.005 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -163,30 +166,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.091 s +ALOHA: aloha creates 2 routines in 0.104 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.992s -user 0m0.334s -sys 0m0.123s +real 0m0.513s +user 0m0.447s +sys 0m0.065s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 61e6f0c54c..aae153e603 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -936,38 +936,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event 
(thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -993,7 +1005,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1010,6 +1022,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,7 +1085,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1128,7 +1142,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1152,7 +1166,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1264,60 +1278,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1325,32 +1340,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const 
fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( 
momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman 
diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int 
uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 332a0806f1..cc9c748aea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005877494812011719  +DEBUG: model prefixing takes 0.003180265426635742  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -156,7 +159,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.010 s +1 processes with 16 diagrams generated in 0.016 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -167,10 +170,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -184,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 
1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -195,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s -Wrote files for 46 helas calls in 0.502 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s +Wrote files for 46 helas calls in 0.179 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 
0.190 s +ALOHA: aloha creates 5 routines in 0.226 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.187 s +ALOHA: aloha creates 10 routines in 0.205 s VVV1 VVV1 FFV1 @@ -223,32 +226,32 @@ ALOHA: aloha creates 10 routines in 0.187 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.233s -user 0m1.496s -sys 0m0.718s -Code generation completed in 5 seconds +real 0m3.146s +user 0m2.680s +sys 0m0.410s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -269,10 +272,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -299,10 +302,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index 30bd3794c3..de51114026 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 091fecf10e..db2341b9c3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // 
SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1075,7 +1088,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1131,7 +1145,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1169,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1281,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1343,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index b47f79aa45..a6ff6ae67f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index ce41e289c6..c920229c65 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1292,7 +1305,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1348,7 +1362,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1386,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1498,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1560,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 44f2636937..d248effd6c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index ca0da2991e..4282896667 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index a43968abf6..b29c6aeca2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 3ed3e82f91..41a39a13ca 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * 
blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA 
architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA 
selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc 
bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except 
FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // 
Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell 
uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index b836987bc5..e40c4635b8 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551295280456543  +DEBUG: model prefixing takes 0.00511932373046875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.017 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -174,25 +177,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 
5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.046 s -Wrote files for 36 helas calls in 0.368 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +Wrote files for 36 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha creates 5 routines in 0.356 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.189 s VVV1 VVV1 FFV1 @@ -202,32 +205,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
+Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m4.945s -user 0m1.513s -sys 0m0.678s -Code generation completed in 5 seconds +real 0m2.687s +user 0m2.344s +sys 0m0.324s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -248,10 +251,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -278,10 +281,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 0fe3df08d4..444f1253f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h 
index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0726e0a6ea..e93eba2447 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR 
channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1292,7 +1305,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1348,7 +1362,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1386,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1498,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1560,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index ebf5273614..6dfa640d9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index c32cb4d43c..77820f0e51 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 6724cffa4b..7388a4bf7e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + 
raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra 
and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := 
$(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ba99f30bdf..b6b3fce0e4 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005433082580566406  +DEBUG: model prefixing takes 0.005330085754394531  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.023 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -163,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.063 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.206 s +ALOHA: aloha creates 5 routines in 0.189 s VVV1 VVV1 FFV1 @@ -184,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.206 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m1.176s -user 0m0.468s -sys 0m0.131s +real 0m1.071s +user 0m0.866s +sys 0m0.192s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 372ced5d87..d1b99f37c2 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1150,38 +1150,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of 
event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1207,7 +1219,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1224,6 +1236,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1286,7 +1299,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1342,7 +1356,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1366,7 +1380,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1478,60 +1492,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1539,32 +1554,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( 
const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input 
array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned 
int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index ea9db152a3..7d81448797 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004921674728393555  +DEBUG: model prefixing takes 0.0037353038787841797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.080 s +1 processes with 123 diagrams generated in 0.107 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -174,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.223 s -Wrote files for 222 helas calls in 0.654 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.295 s +Wrote files for 222 helas calls in 0.444 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.185 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.197 s +ALOHA: aloha creates 10 routines in 0.236 s VVV1 VVV1 FFV1 @@ -205,32 +208,32 @@ ALOHA: aloha creates 10 routines in 0.197 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m5.675s -user 0m2.118s -sys 0m0.681s -Code generation completed in 6 seconds +real 0m3.446s +user 0m3.048s +sys 0m0.333s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -251,10 +254,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -281,10 +284,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 5fe0cb01be..5ffde659c4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 0f6ddcae67..e90ff2af99 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -3084,38 +3084,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // 
SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3141,7 +3153,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3158,6 +3170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3220,7 +3233,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3276,7 +3290,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3300,7 +3314,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3412,60 +3426,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3473,32 +3488,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 96f4a4724c..6ef3863ae3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 6a61beea31..60ddbdbedd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index 0f7fcaa25f..3a363b9a2b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index b173f22bfc..19b83eff7d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # 
See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe 
failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: 
bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ 
extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: 
+ raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for 
color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname 
-p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7ff994126b..b8b747dcaf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003983020782470703  +DEBUG: model prefixing takes 0.00394439697265625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.081 s +1 processes with 123 diagrams generated in 0.135 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -163,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.216 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.227 s VVV1 VVV1 FFV1 @@ -187,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.204 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.544s -user 0m0.774s -sys 0m0.144s -Code generation completed in 2 seconds +real 0m1.193s +user 0m1.092s +sys 0m0.096s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 08a537c1f2..31b2a909f7 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -3141,38 +3141,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; 
// index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3198,7 +3210,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3215,6 +3227,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3277,7 +3290,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3333,7 +3347,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3357,7 +3371,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3469,60 +3483,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3530,32 +3545,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 96f4a4724c..6ef3863ae3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( 
const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the 
input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ebb525b6f1..11081fe610 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036034584045410156  +DEBUG: model prefixing takes 0.003200531005859375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.963 s +1 processes with 1240 diagrams generated in 1.330 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,16 +162,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 6s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -176,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 
110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 
295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 
495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 
695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 
1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 
153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 
353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 
553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 
753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 
1121: 944, 1122: 945} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.355 s -Wrote files for 2281 helas calls in 9.598 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 
214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 
458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 
693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 
930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 
53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 
290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 
525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 
759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 
850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.833 s +Wrote files for 2281 helas calls in 12.146 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.251 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.231 s +ALOHA: aloha creates 10 routines in 0.180 s VVV1 VVV1 FFV1 @@ -207,32 +210,32 @@ ALOHA: aloha creates 10 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.546s -user 0m16.458s -sys 0m0.884s -Code generation completed in 20 seconds +real 0m23.417s +user 0m22.882s +sys 0m0.409s +Code generation completed in 23 seconds ************************************************************ * * * W E L C O M E to * @@ -253,10 +256,10 @@ Code generation completed in 20 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -283,10 +286,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 08a07273bc..e1b2f3835b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 148ad48435..c6619a0fa0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30655,38 +30655,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event 
(thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -30712,7 +30724,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -30729,6 +30741,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -30791,7 +30804,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -30847,7 +30861,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, 
allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30871,7 +30885,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -30983,60 +30997,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -31044,32 +31059,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 75c52ba31a..fbe1065f6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 95f2b50e68..7a436cbd5c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 14d6ca8aa6..0c9e5f7080 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=128) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f index 870c890410..5ae4792dfa 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index ff1a367151..e11cef7ff9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -398,7 +398,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -436,8 +436,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * 
blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA 
architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA 
selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc 
bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except 
FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // 
Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell 
uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 66cd67a19b..14ddd64b84 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0029516220092773438  +DEBUG: model prefixing takes 0.004296302795410156  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.953 s +1 processes with 1240 diagrams generated in 1.487 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -163,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 
'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.379 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
+Generated helas calls for 1 subprocesses (1240 diagrams) in 4.995 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.214 s +ALOHA: aloha creates 5 routines in 0.180 s VVV1 VVV1 FFV1 @@ -187,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.214 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m7.419s -user 0m6.626s -sys 0m0.185s -Code generation completed in 7 seconds +real 0m9.308s +user 0m9.042s +sys 0m0.122s +Code generation completed in 10 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // the operands of this product are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 6b89d18559..7de7d71e25 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -32545,38 +32545,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -32602,7 +32614,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -32619,6 +32631,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -32681,7 +32694,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // the leading operands of this product are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -32737,7 +32751,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, 
allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32761,7 +32775,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -32873,60 +32887,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -32934,32 +32949,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 75c52ba31a..fbe1065f6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f index 870c890410..5ae4792dfa 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu 
static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - 
CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 248fa16d65..780ea50154 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052111148834228516  +DEBUG: model prefixing takes 0.004786014556884766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +166,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.041 s +8 processes with 40 diagrams generated in 0.052 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -174,10 +177,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -197,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -208,50 +211,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.017 s -Wrote files for 32 helas calls in 0.625 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.022 s +Wrote files for 32 helas calls in 0.114 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.094 s +ALOHA: aloha creates 2 routines in 0.095 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.080 s +ALOHA: aloha creates 4 routines in 0.074 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m5.076s -user 0m1.391s -sys 0m0.672s -Code generation completed in 5 seconds +real 0m2.221s +user 0m1.894s +sys 0m0.308s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,10 +275,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -302,10 +305,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index aba2f10b06..7ba8666046 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h 
index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameters are of type int; the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 787b72a15b..10bc00bbe1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR 
channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1130,7 +1143,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1200,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1224,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1336,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1398,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 4595d5a38e..37932e73a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 0f523f574b..f252c024f6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 90ac031008..14fd0f0017 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -361,8 +361,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2c28c73eb..6cdb911a34 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1130,7 +1143,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1200,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1224,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1336,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1398,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e239a05794..748758b702 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 7240e416ab..bb34349714 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index aa0f9bedff..784c7b3ebc 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -361,8 +361,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + 
raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra 
and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := 
$(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index e76b814911..e099e064a7 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.0030565261840820312  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.040 s +8 processes with 40 diagrams generated in 0.056 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -186,40 +189,40 @@ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 
'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  DEBUG: type(subproc_group)= [output.py at line 223]  DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=1 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.016 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.025 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.090 s +ALOHA: aloha creates 2 routines in 0.103 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.337s -user 0m0.375s -sys 0m0.160s -Code generation completed in 2 seconds +real 0m0.601s +user 0m0.523s +sys 0m0.077s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index eea3950214..450397eebc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -989,38 +989,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of 
event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1046,7 +1058,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1063,6 +1075,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1125,7 +1138,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1181,7 +1195,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1205,7 +1219,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1317,60 +1331,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1378,32 +1393,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index bb8b2f2773..dea02d2ce8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -989,38 +989,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1046,7 +1058,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1063,6 +1075,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1125,7 +1138,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1181,7 +1195,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1205,7 +1219,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1317,60 +1331,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1378,32 +1393,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( 
const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input 
array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned 
int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index f374f8f313..28372a7d42 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,22 +39,24 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  @@ -120,7 +123,7 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.004 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -131,10 +134,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -146,55 +149,55 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s -Wrote files for 12 helas calls in 0.268 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s +Wrote files for 12 helas calls in 0.062 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.200 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha 
creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.152 s +ALOHA: aloha creates 8 routines in 0.174 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m4.654s -user 0m1.223s -sys 0m0.605s -Code generation completed in 5 seconds +real 0m2.283s +user 0m1.905s +sys 0m0.318s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -215,10 +218,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -245,10 +248,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 84c16b4cf4..5c112346ee 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index c32c974cc1..b94c1c9d64 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -953,38 +953,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event 
(thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1010,7 +1022,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1027,6 +1039,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1089,7 +1102,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1145,7 +1159,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,7 +1183,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1281,60 +1295,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1342,32 +1357,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 543e74fad7..8e08d92d87 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 785453cfcf..3130417167 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index fc8effb6b2..6346c8cc25 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 66966ada1a..6fb79f6e5d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = 
blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq 
($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, 
selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input 
random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index e04a2da479..11221ddda0 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ 
b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,71 +39,25 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:38:21-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz -Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 -Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 
100% 2.92M=0.02s - -2026-03-10 10:38:22 (2.92 MB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.007684946060180664  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -168,13 +123,13 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -183,34 +138,34 @@ INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 
'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
+Generated helas calls for 1 subprocesses (4 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.206 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m1.669s -user 0m0.522s -sys 0m0.180s -Code generation completed in 2 seconds +real 0m0.767s +user 0m0.655s +sys 0m0.105s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel 
for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index 7a1f85c7cc..b2f5d2fe67 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -949,38 +949,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1006,7 +1018,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1023,6 +1035,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1085,7 +1098,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1141,7 +1155,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1165,7 +1179,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1277,60 +1291,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1338,32 +1353,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 543e74fad7..8e08d92d87 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace 
mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, 
*pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number 
Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + 
typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 5067c06ff1..41e608e60a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003014802932739258  +DEBUG: model prefixing takes 0.002887248992919922  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.056 s +4 processes with 8 diagrams generated in 0.067 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -220,7 +223,7 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.331 s +12 processes with 144 diagrams generated in 0.454 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -231,10 +234,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --v INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -268,9 
+271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -279,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -290,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -301,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -312,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -323,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -334,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -345,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.104 s -Wrote files for 212 helas calls in 2.138 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.127 s +Wrote files for 212 helas calls in 0.633 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.123 s +ALOHA: aloha creates 3 routines in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.122 
s +ALOHA: aloha creates 6 routines in 0.121 s FFV1 FFV1 FFV1 @@ -367,32 +370,32 @@ ALOHA: aloha creates 6 routines in 0.122 s FFV2 FFV2 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m8.122s -user 0m2.522s -sys 0m1.075s -Code generation completed in 8 seconds +real 0m4.417s +user 0m3.753s +sys 0m0.584s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -413,10 +416,10 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -443,10 +446,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 3f652ded8d..981120a965 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h 
index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 9d43997b76..c2e9d2379e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -966,38 +966,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; 
// index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1023,7 +1035,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1040,6 +1052,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1102,7 +1115,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1158,7 +1172,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1182,7 +1196,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1294,60 +1308,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1355,32 +1370,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 53f417c646..56d598b7a9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 16d9b1bce8..1ed73392bb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 983025466d..959af9abb8 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=48) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 97ed635786..8ea7cbd981 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -373,8 +373,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 83d25c8021..e008b431dc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -966,38 +966,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement 
(fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1023,7 +1035,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1040,6 +1052,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1102,7 +1115,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1158,7 +1172,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1182,7 +1196,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1294,60 +1308,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1355,32 +1370,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index 3ac92dd2c9..913c0dfeed 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 37f83693d3..f4a45af3d4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 2224f52ad1..d2d45ddbaa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=48) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f index 1496eebe35..72232f43be 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -373,8 +373,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 152beb1322..693440b711 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index 20f8a6d2b4..1335e38061 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index af77031e76..c9599a9732 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 
finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index a566870b6b..5681d8535f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM 
graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 0f5afbd521..bf4575a44d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index 8f0bfc615c..9fc599ac21 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index e94d034748..27f4d1c5c2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 633c2bda2a..461b2a1f4c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 7fda166f5a..f72ed6255e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index 8d05da36d4..60736d40f4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 209e073d74..a2b839f183 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index a83896951d..b11b67d795 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index df3b5e689b..345754eb7e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 
reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index 5a48f895c3..0caf0301e3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index cb4090e743..8befd86e93 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index f63f49b5fd..0616b6b85d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index eadff47f18..96aee249a6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index 8a448d0444..ab11a90fc1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index e2759d19f6..b5b15b7c5c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index bf1d47c73c..1d55f3f5b6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index b97e46ece1..ae53d88775 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index 1642721bee..c3531e18ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index a0091febb6..57090d058f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 
reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 92e84c1147..b2737ec3fa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f index e194b5f639..d475dc7829 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index b6bdeb9a02..41fef341d6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event 
(CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1298,7 +1311,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1354,7 +1368,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1392,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1504,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1566,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index 3e7ccff73e..8cfc26cf49 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 369bf6cdf6..3b09bc2ba0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 
finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 75c9ced543..41ca9266c1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM 
graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f index 164ddfda7d..795d4cc364 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int 
ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq 
($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, 
selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - 
RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = 
resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), 
aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir 
$(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index a8e3a6d67a..db7adae003 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). 
@@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004999399185180664  +DEBUG: model prefixing takes 0.0032334327697753906  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.015 s +5 processes with 7 diagrams generated in 0.019 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -205,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.070 s +13 processes with 76 diagrams generated in 0.083 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -371,7 +374,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 0.941 s +65 processes with 1119 diagrams generated in 1.244 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -382,10 +385,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vec INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -496,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 
18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 
32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -507,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -518,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -529,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -540,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -551,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -562,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -573,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -584,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -595,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -606,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -617,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -628,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -639,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -650,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -661,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -672,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -683,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1748]  -Generated helas calls for 18 subprocesses (372 diagrams) in 0.671 s -Wrote files for 810 helas calls in 5.590 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1749]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.763 s +Wrote files for 810 helas calls in 1.804 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.216 s +ALOHA: aloha creates 5 routines in 0.175 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates 
VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.214 s VVV1 VVV1 FFV1 @@ -714,32 +717,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m15.089s -user 0m5.988s -sys 0m1.827s -Code generation completed in 16 seconds +real 0m8.708s +user 0m7.758s +sys 0m0.875s +Code generation completed in 9 seconds ************************************************************ * * * W E L C O M E to * @@ -760,10 +763,10 @@ Code generation completed in 16 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -790,10 +793,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index fa1bcf88f4..86d647aa4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel 
for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 8b330d85d5..61984fa1ed 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // 
SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1075,7 +1088,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1131,7 +1145,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1169,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1281,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1343,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 1aaf72997b..f67a329ee7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index e5f47166fb..690f872c32 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 0d129ab296..a582ad2fd4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index 2d0cc3a394..f01adf2ab1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index bd9ec082ce..205b4adbb9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -916,38 +916,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == 
ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -973,7 +985,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -990,6 +1002,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1052,7 +1065,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1108,7 +1122,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1132,7 +1146,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1244,60 +1258,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1305,32 +1320,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index a96df4e864..e822c4f778 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index ae9439cf9e..b34048e29d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index c155307e43..0600c671ce 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ccb869545a..1e6c927bfb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -289,7 +289,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -330,8 +330,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0726e0a6ea..e93eba2447 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1292,7 +1305,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1348,7 +1362,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1386,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1498,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1560,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index ebf5273614..6dfa640d9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index c32cb4d43c..77820f0e51 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 6724cffa4b..7388a4bf7e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 92c74d5c62..4a55d8aa90 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1130,7 +1143,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1200,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1224,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1336,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1398,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 4595d5a38e..37932e73a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 0f523f574b..f252c024f6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index a06e72a3c3..613dbb7f66 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 77d9edb7b2..4a48ab0895 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const 
int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1130,7 +1143,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1200,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1224,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1336,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1398,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (NB: the channelId/iconfig sanity checks above only run when igraph for ievt is 0) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e239a05794..748758b702 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C 
Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 7240e416ab..bb34349714 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = 
no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index a162af362e..22a6b8c5b9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 888768ef3b..64df152d47 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1130,7 +1143,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1200,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1224,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1336,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1398,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (NB: the sanity checks above run only when ievt also falls back to channel2iconfig) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 01180e3e92..f776ee3de7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index b15c35131c..01735be0d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index 95e3e81bc6..632d791617 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 16e908ba11..0ec6e93020 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index f1617232e3..c54a3fe49b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -3084,38 +3084,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3141,7 +3153,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3158,6 +3170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3220,7 +3233,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3276,7 +3290,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3300,7 +3314,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3412,60 +3426,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3473,32 +3488,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 363ab0b79d..f51b7656c4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 1108637c49..e7f590a087 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 02c9412706..4705c638be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 51476eb7fa..b4b9172028 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 7e011c2c62..c1f4c28e6c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1627,7 +1640,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1683,7 +1697,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1721,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1833,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1895,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index eb46a03db6..1f9c0ec433 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 0f260565e3..5456c9a1d1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C 
Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index acc21004ae..263e3be1b2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -352,6 +352,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = 
no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -362,6 +365,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -476,7 +480,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -549,19 +553,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -631,7 +637,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -655,7 +661,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -663,7 +669,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index d46d392b1f..3816770328 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 20e3623198..20bc65a936 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1627,7 +1640,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1683,7 +1697,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1721,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1833,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1895,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 516900ab3b..916fafcf3e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 0ae010df69..3edd289da8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 2ed82fafaa..16d795c6a6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index ea575a9bc3..c10cd1e6e8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 1ba94ad37f..45ea84b687 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1627,7 +1640,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1683,7 +1697,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1721,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1833,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1895,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index bcc9e9d736..067e81bad8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index 236f6d16a9..c858b2c684 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: 
C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index dcf20fe396..1a6277d156 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT 
(0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index a780b1f4fa..8fdfbc4513 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 7665fa9af8..934030a5b4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -1072,38 +1072,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1129,7 +1141,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1146,6 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1208,7 +1221,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1264,7 +1278,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1288,7 +1302,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1400,60 +1414,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1461,32 +1476,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 553048dc11..650bd18517 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -192,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index 956dc07485..37ff46da63 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -796,8 +796,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -898,9 +897,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1210,7 +1208,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1225,10 +1223,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1240,7 +1241,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index 9bc73e492f..284cc76158 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -368,6 +368,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -380,6 +383,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -514,7 +518,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -587,19 +591,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -669,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -693,7 +699,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -701,7 +707,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index 559059580c..572c6ced56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -339,7 +339,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -382,8 +382,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index a7fde33970..7ce718c14d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -1078,38 +1078,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1135,7 +1147,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1152,6 +1164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1214,7 +1227,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1270,7 +1284,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1294,7 +1308,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1406,60 +1420,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1467,32 +1482,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index b187f2ebf3..0d2dd21169 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -198,6 +199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 9c2c20435d..bfc32bbd25 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -802,8 +802,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: 
C Amplitude squared and summed C **************************************************** @@ -904,9 +903,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1216,7 +1214,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1231,10 +1229,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1246,7 +1247,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index bef5d7dd9f..33cf19f705 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -400,6 +400,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT 
(0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -412,6 +415,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -586,7 +590,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -659,19 +663,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -741,7 +747,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -765,7 +771,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -773,7 +779,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 56a2755163..65520b0758 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -345,7 +345,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -394,8 +394,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index a299144ca6..409028ebd7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1318,7 +1331,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1374,7 +1388,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1412,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1524,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1586,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 98e755a489..70826b49e7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index bed31f9d2f..39a81a621a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index 9c2eb40089..d76f34423b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8d7c00bfcd..0218e57040 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index edaf7372cc..60d176890e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -1078,38 +1078,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1135,7 +1147,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1152,6 +1164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1214,7 +1227,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1270,7 +1284,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1294,7 +1308,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1406,60 +1420,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1467,32 +1482,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 0c551f2f4d..3c73ffcdae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -198,6 +199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index 48de6ee6aa..fb7d3f331a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -802,8 +802,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: 
C Amplitude squared and summed C **************************************************** @@ -904,9 +903,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1216,7 +1214,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1231,10 +1229,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1246,7 +1247,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 018c1a985b..19450cafaf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -400,6 +400,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT 
(0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -412,6 +415,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -586,7 +590,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -659,19 +663,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -741,7 +747,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -765,7 +771,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -773,7 +779,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 440f838b87..2c2555366f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -345,7 +345,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -394,8 +394,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 57a20afa9c..aa819764d6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1627,7 +1640,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1683,7 +1697,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1721,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1833,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1895,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 3290858ea0..977c1f0143 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 1b37ae6930..0441af0818 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C 
Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index e72dc0ca8c..cdb3a6377b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = 
no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index bc51e47c27..ae3e89ba33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 4a0583759f..01837a3ec0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix 
#899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1318,7 +1331,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1374,7 +1388,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1412,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1524,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1586,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 880e2dace8..49758d2918 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index d51e86247a..016741f374 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: 
C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index c8106d783a..0a2a87d5d5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT 
(0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index ae0a828447..3dcb0ae4ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 8e34c58b00..b24aceadbe 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -1072,38 +1072,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1129,7 +1141,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1146,6 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1208,7 +1221,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1264,7 +1278,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1288,7 +1302,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1400,60 +1414,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1461,32 +1476,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 314d5b2955..6e7d0b1d10 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -192,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index 8991a26bd9..f93d884900 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -796,8 +796,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 
finalize C Output: C Amplitude squared and summed C **************************************************** @@ -898,9 +897,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1210,7 +1208,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1225,10 +1223,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1240,7 +1241,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 5e6645a738..b507ecd05b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -368,6 +368,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM 
graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -380,6 +383,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -514,7 +518,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -587,19 +591,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -669,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -693,7 +699,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -701,7 +707,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index ef2d0fcb85..0eba207bd8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -339,7 +339,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -382,8 +382,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index b6b3dab286..3a786dd36f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1318,7 +1331,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all factors are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1374,7 +1388,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1412,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1524,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1586,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (NB: the sanity checks above only ran if ievt itself had igraph == 0) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 89c57825a9..6e4939c539 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index 8d5a646679..e28b5f2e76 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 
finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index 7d08f78919..f01a8215a1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM 
graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index 85463860ad..c8a51154a0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x 
* blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia 
CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, plus PTX for the highest one (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, 
*pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number 
Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // 
RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection 
UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 9a1af87664..ef5b9e073b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). 
@@ -38,22 +39,24 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -70,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + 
Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06830000877380371  +DEBUG: model prefixing takes 0.0440211296081543  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -85,7 +88,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.021 s +1 processes with 72 diagrams generated in 2.480 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False 
--vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -96,10 +99,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False - INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -111,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 
40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.097 s -Wrote files for 119 helas calls in 0.474 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 
40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.102 s +Wrote files for 119 helas calls in 0.241 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.198 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.193 s +ALOHA: aloha creates 10 routines in 0.195 s VVV5 VVV5 FFV1 @@ -139,32 +142,32 @@ ALOHA: aloha creates 10 routines in 0.193 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. 
+Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.520s -user 0m3.917s -sys 0m0.620s -Code generation completed in 8 seconds +real 0m5.280s +user 0m4.967s +sys 0m0.291s +Code generation completed in 5 seconds ************************************************************ * * * W E L C O M E to * @@ -185,10 +188,10 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards run @@ -215,10 +218,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 5e08560167..9f01c208e8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h 
index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 0d7fe2e5ae..66a4e061f9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -2017,38 +2017,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -2074,7 +2086,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -2091,6 +2103,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2153,7 +2166,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2209,7 +2223,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2233,7 +2247,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -2345,60 +2359,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -2406,32 +2421,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index 87d1743da6..aa52499cf0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index e9f856aa23..b68b2dd12c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 
reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 7f0900eb3e..731770fcdf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f index 0f8b03e464..841be0ffef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * 
blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler 
@@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, 
channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array 
for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + 
cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined 
MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index c1a6a8c137..8fd1ffeee9 100644 --- 
a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,57 +39,24 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). 
Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:39:42-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz -Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 -Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. -HTTP request sent, awaiting response... 200 Ok -Length: 80562 (79K) [application/x-tar] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... .......... 63% 832K 0s - 50K .......... .......... ........ 
100% 70.5M=0.06s - -2026-03-10 10:39:43 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] - -SMEFTsim_topU3l_MwScheme_UFO/ -SMEFTsim_topU3l_MwScheme_UFO/__init__.py -SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py -SMEFTsim_topU3l_MwScheme_UFO/particles.py -SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py -SMEFTsim_topU3l_MwScheme_UFO/decays.py -SMEFTsim_topU3l_MwScheme_UFO/parameters.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/object_library.py -SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py -SMEFTsim_topU3l_MwScheme_UFO/version.info -SMEFTsim_topU3l_MwScheme_UFO/function_library.py -SMEFTsim_topU3l_MwScheme_UFO/couplings.py -SMEFTsim_topU3l_MwScheme_UFO/propagators.py -SMEFTsim_topU3l_MwScheme_UFO/lorentz.py -SMEFTsim_topU3l_MwScheme_UFO/vertices.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat -fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO -retry the load of the model +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -105,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06466126441955566  +DEBUG: 
model prefixing takes 0.1009819507598877  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -114,22 +82,19 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ -INFO: Change particles name to pass to MG5 convention -Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged -Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ 
WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.072 s +1 processes with 72 diagrams generated in 2.979 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -138,18 +103,18 @@ INFO: Processing color information for process: g g > t t~ t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 
'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.094 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
+Generated helas calls for 1 subprocesses (72 diagrams) in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.237 s VVV5 VVV5 FFV1 @@ -159,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.194 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.177s -user 0m2.874s -sys 0m0.228s -Code generation completed in 4 seconds +real 0m4.588s +user 0m4.372s +sys 0m0.143s +Code generation completed in 5 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h 
index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index dc1d2ecd53..b64bd4553e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -1965,38 +1965,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if 
disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -2022,7 +2034,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -2039,6 +2051,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2101,7 +2114,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2157,7 +2171,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2181,7 +2195,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -2293,60 +2307,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -2354,32 +2369,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index 87d1743da6..aa52499cf0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor 
separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, 
mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity 
selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) 
) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index e0e58acbf4..f54c66bc71 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +550,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.099 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +561,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False -- INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -573,52 +576,52 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s -Wrote files for 16 helas calls in 0.279 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s +Wrote files for 16 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.114 s +ALOHA: aloha creates 3 routines in 0.223 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.120 s +ALOHA: aloha creates 6 routines in 0.113 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m5.502s -user 0m1.722s -sys 0m0.643s -Code generation completed in 6 seconds +real 0m3.252s +user 0m2.829s +sys 0m0.336s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -639,10 +642,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -669,10 +672,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index ee7d1277ff..f07e5631fd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all factors below are of type int: the first is cast to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 7aef93970a..4573f2ffb8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -962,38 +962,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1019,7 +1031,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1036,6 +1048,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1098,7 +1111,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // the int factors are multiplied left to right: the first is cast to size_t to avoid integer overflow if their product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1154,7 +1168,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1178,7 +1192,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1290,60 +1304,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1351,32 +1366,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index 293c26a2e9..f5d3042d1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index c8bb469792..fe7a4274ea 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index bdf00312dc..5e894db7e8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=4) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index c5dcf87c06..2c3622336c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -274,7 +274,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -312,8 +312,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - 
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 
+191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, 
mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity 
selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = 
resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int 
bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0ee162c616..2955217a8a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ 
b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,13 +550,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.101 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -562,32 +565,32 @@ INFO: Processing color information for process: g g > t1 t1~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 
'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.113 s +ALOHA: aloha creates 3 routines in 0.158 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.441s -user 0m0.724s -sys 0m0.134s +real 0m1.233s +user 0m1.121s +sys 0m0.102s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h 
index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index c5cac709d7..7b46d3ea33 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -963,38 +963,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1020,7 +1032,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1037,6 +1049,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1099,7 +1112,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1155,7 +1169,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1179,7 +1193,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1291,60 +1305,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1352,32 +1367,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index 293c26a2e9..f5d3042d1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! 
not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt 
matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, 
selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - 
RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) 
); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 88e01c7e57..8772a13e3d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. 
Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +39,17 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +550,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.052 s +1 processes with 3 diagrams generated in 0.072 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +561,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -573,49 +576,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.273 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.053 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.079 s +ALOHA: aloha creates 4 routines in 0.092 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m5.086s -user 0m1.635s -sys 0m0.704s -Code generation completed in 5 seconds +real 0m2.544s +user 0m2.202s +sys 0m0.314s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -636,10 +639,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -666,10 +669,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 3a6928f635..3c7c799a87 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index 
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b575475690..9ef41eadef 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event 
(thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1075,7 +1088,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1131,7 +1145,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, 
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1169,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1281,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1343,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 732f9919c9..58e1bfe668 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index ef4145fa88..52a516cda9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = 
blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq 
($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, 
selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input 
random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 7142d5e27a..5ef40ad902 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ 
b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,4 +1,4 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  +WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +29,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,23 +39,22 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.4310164451599121  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,13 +550,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.054 s +1 processes with 3 diagrams generated in 0.103 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -565,30 +565,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.082 s +ALOHA: aloha creates 2 routines in 0.090 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m2.103s -user 0m1.223s -sys 0m0.178s -Code generation completed in 2 seconds +real 0m1.091s +user 0m0.998s +sys 0m0.091s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel 
for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..e064b4bbfe 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -469,7 +469,8 @@ namespace mg5amcGpu m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + m_pHelJamps.reset( new DeviceBufferSimple( static_cast<size_t>( nGoodHel ) * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) @@ -504,7 +505,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index
16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index e0e3bfd321..59e9d29536 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -936,38 +936,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = 
blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -993,7 +1005,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1010,6 +1022,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,7 +1085,8 @@ namespace mg5amcCpu // *** PART 0a - CUDA *** const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); - gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); + // all function parameter are type int, the first is casted to size_t to avoid integer overflow if the product is greater than max(int) + gpuMemset( ghelAllJamps, 0, static_cast<size_t>( cNGoodHel ) * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1128,7 +1142,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs,
gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1152,7 +1166,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1264,60 +1278,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1325,32 +1340,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 732f9919c9..58e1bfe668 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! 
not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h index 9e942d3edc..9ec84c36a8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h @@ -28,9 +28,9 @@ namespace mg5amcCpu static __device__ inline cxtype_ref kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices 
for each of real and imag (ievt last) @@ -43,9 +43,9 @@ namespace mg5amcCpu static __device__ inline const cxtype kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) { - const int ncolor = CPPProcess::ncolor; // the number of leading colors - const int nevt = gridDim.x * blockDim.x; - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + const size_t ncolor = CPPProcess::ncolor; // the number of leading colors + const size_t nevt = gridDim.x * blockDim.x; + const size_t ievt = blockDim.x * blockIdx.x + threadIdx.x; // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..2d90fafa6a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' > $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, 
*pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number 
Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + 
typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma"