Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ noinst_HEADERS = \
rdma/gin/nccl_ofi_gin_reqs.h \
rdma/gin/nccl_ofi_gin_resources.h \
rdma/gin/nccl_ofi_gin_types.h \
rdma/gin/nccl_ofi_gin_gdaki.h \
nccl_ofi_gin_base.h \
nccl_ofi.h \
nccl_ofi_api.h \
Expand Down
6 changes: 6 additions & 0 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,4 +374,10 @@ OFI_NCCL_PARAM(std::string, platform, "PLATFORM", "");
OFI_NCCL_PARAM_VALUE_SET(NVTX_TRACE_DIMENSION, (PER_COMM)(PER_DEV))
OFI_NCCL_PARAM(NVTX_TRACE_DIMENSION, nvtx_trace_dimension, "NVTX_TRACE_DIMENSION", NVTX_TRACE_DIMENSION::PER_COMM)

/*
* Enable GDAKI (GPUDirect Async Kernel-Initiated) mode for the GIN plugin.
* Not yet supported — stub implementation only.
*/
OFI_NCCL_PARAM(bool, gin_gdaki, "GIN_GDAKI", false);

#endif // End NCCL_OFI_PARAM_H_
21 changes: 21 additions & 0 deletions include/rdma/gin/nccl_ofi_gin_gdaki.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Copyright (c) 2026 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#ifndef NCCL_OFI_GIN_GDAKI_H_
#define NCCL_OFI_GIN_GDAKI_H_

#include "nccl_ofi.h"

/*
 * Report whether GDAKI mode has been requested.
 *
 * Returns the value of the gin_gdaki plugin parameter, which is controlled
 * by the OFI_NCCL_GIN_GDAKI environment variable (OFI_NCCL_GIN_GDAKI=1
 * requests GDAKI mode).
 */
bool nccl_ofi_gin_gdaki_enabled();

/*
 * The GDAKI plugin table.
 *
 * The entries shared with the proxy plugin (init, devices, listen, connect)
 * are defined as nullptr and are copied from the proxy plugin at init time,
 * so they must not be called through this table before that has happened.
 */
extern ncclGin_v11_t nccl_ofi_gin_gdaki_plugin;

#endif /* NCCL_OFI_GIN_GDAKI_H_ */
3 changes: 2 additions & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ sources += \
rdma/gin/nccl_ofi_gin_allgather.cpp \
rdma/gin/nccl_ofi_gin_api.cpp \
rdma/gin/nccl_ofi_gin_reqs.cpp \
rdma/gin/nccl_ofi_gin_resources.cpp
rdma/gin/nccl_ofi_gin_resources.cpp \
rdma/gin/nccl_ofi_gin_gdaki.cpp
endif

# add the tuner sources into the library
Expand Down
22 changes: 22 additions & 0 deletions src/rdma/gin/nccl_ofi_gin_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@

#include "config.h"

#include <cstring>
Comment thread
bhasunit marked this conversation as resolved.

#include "rdma/gin/nccl_ofi_gin.h"
#include "rdma/gin/nccl_ofi_gin_types.h"
#include "rdma/gin/nccl_ofi_gin_gdaki.h"
#include "nccl_ofi.h"
#include "nccl_ofi_api.h"
#include "nccl_ofi_param.h"

/* Forward declaration — defined at bottom of this file */
extern ncclGin_v11_t ncclGinPlugin_v11;

/**
* Structure to hold GIN context data.
* This is created once per NCCL communicator and passed to all listen() calls
Expand Down Expand Up @@ -69,6 +75,22 @@ static ncclResult_t nccl_ofi_gin_init(void **ctx, uint64_t commId, ncclDebugLogg
return ncclSystemError;
}

/*
* Morph the exported plugin to GDAKI if requested.
*
* Copy shared functions (init, devices, listen, connect) from the
* proxy plugin into the GDAKI plugin, then overwrite the exported
* symbol with the GDAKI plugin.
*/
if (nccl_ofi_gin_gdaki_enabled()) {
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "gin: GDAKI mode enabled (OFI_NCCL_GIN_GDAKI=1)");
nccl_ofi_gin_gdaki_plugin.init = ncclGinPlugin_v11.init;
nccl_ofi_gin_gdaki_plugin.devices = ncclGinPlugin_v11.devices;
nccl_ofi_gin_gdaki_plugin.listen = ncclGinPlugin_v11.listen;
nccl_ofi_gin_gdaki_plugin.connect = ncclGinPlugin_v11.connect;
memcpy(&ncclGinPlugin_v11, &nccl_ofi_gin_gdaki_plugin, sizeof(ncclGinPlugin_v11));
}

return ncclSuccess;
}

Expand Down
151 changes: 151 additions & 0 deletions src/rdma/gin/nccl_ofi_gin_gdaki.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
* Copyright (c) 2026 Amazon.com, Inc. or its affiliates. All rights reserved.
*
* GDAKI stub implementations for the GIN plugin API.
* These provide the full ncclGin_v11_t plugin for GDAKI mode.
*
* Task: GDAKI stub implementation
*/

#include "config.h"

#include "rdma/gin/nccl_ofi_gin_gdaki.h"
#include "nccl_ofi.h"
#include "nccl_ofi_api.h"
#include "nccl_ofi_param.h"

/*
 * Report whether GDAKI mode has been requested.
 *
 * Returns the current value of the gin_gdaki plugin parameter.
 */
bool nccl_ofi_gin_gdaki_enabled()
{
	const bool gdaki_requested = ofi_nccl_gin_gdaki.get();

	return gdaki_requested;
}

/*
 * getProperties entry point of the GDAKI plugin.
 *
 * Queries the plugin properties for device `dev` and translates them into
 * the ncclNetProperties_v11_t layout, advertising the device type as
 * NCCL_NET_DEVICE_GIN_GDAKI.
 *
 * dev:   index of the device to query
 * props: output structure; left untouched when the query fails
 *
 * Returns ncclSuccess, or the error reported by nccl_net_ofi_get_properties().
 */
static ncclResult_t nccl_ofi_gin_gdaki_get_properties(int dev, ncclNetProperties_v11_t *props)
{
	nccl_ofi_properties_t plugin_props;
	const ncclResult_t status = nccl_net_ofi_get_properties(dev, &plugin_props);
	if (status != ncclSuccess) {
		return status;
	}

	/* Device identity */
	props->name = plugin_props.name;
	props->pciPath = plugin_props.pci_path;
	props->guid = plugin_props.guid;

	/* Host pointers are always accepted; CUDA and dmabuf only when reported */
	props->ptrSupport = NCCL_PTR_HOST |
			    (plugin_props.hmem_support ? NCCL_PTR_CUDA : 0) |
			    (plugin_props.dmabuf_support ? NCCL_PTR_DMABUF : 0);

	/* Link characteristics and limits */
	props->regIsGlobal = plugin_props.regIsGlobal;
	props->speed = plugin_props.port_speed;
	props->port = plugin_props.port_number;
	props->latency = plugin_props.latency;
	props->maxComms = plugin_props.max_communicators;
	props->maxRecvs = plugin_props.max_group_receives;
	props->maxP2pBytes = plugin_props.max_p2p_bytes;
	props->maxCollBytes = plugin_props.max_coll_bytes;

	/* GDAKI device type, with no valid device version advertised */
	props->netDeviceType = NCCL_NET_DEVICE_GIN_GDAKI;
	props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;

	/* The virtual-device list holds just this one device */
	props->vProps.ndevs = 1;
	props->vProps.devs[0] = dev;

	return ncclSuccess;
}

/*
 * GDAKI stub for createContext.
 *
 * Not implemented: logs a warning that includes the requested signal and
 * counter counts, then fails with ncclInternalError. Neither ginCtx nor
 * devHandle is written.
 */
static ncclResult_t nccl_ofi_gin_gdaki_createContext(void *collComm, int nSignals, int nCounters,
						     void **ginCtx,
						     ncclNetDeviceHandle_v11_t **devHandle)
{
	const ncclResult_t unsupported = ncclInternalError;

	NCCL_OFI_WARN("gin GDAKI: createContext not yet implemented (nSignals=%d, nCounters=%d)",
		      nSignals, nCounters);

	return unsupported;
}

/*
 * GDAKI stub for destroyContext.
 *
 * Not implemented: logs a warning, ignores ginCtx and reports ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_destroyContext(void *ginCtx)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: destroyContext not yet implemented");

	return status;
}

/*
 * GDAKI stub for regMrSym.
 *
 * Not implemented: logs a warning and fails with ncclInternalError.
 * Neither mhandle nor ginHandle is written.
 */
static ncclResult_t nccl_ofi_gin_gdaki_regMrSym(void *collComm, void *data, size_t size, int type,
						uint64_t mrFlags, void **mhandle, void **ginHandle)
{
	const ncclResult_t unsupported = ncclInternalError;

	NCCL_OFI_WARN("gin GDAKI: regMrSym not yet implemented");

	return unsupported;
}

/*
 * GDAKI stub for regMrSymDmaBuf.
 *
 * Not implemented: logs a warning and fails with ncclInternalError.
 * Neither mhandle nor ginHandle is written.
 */
static ncclResult_t nccl_ofi_gin_gdaki_regMrSymDmaBuf(void *collComm, void *data, size_t size,
						      int type, uint64_t offset, int fd,
						      uint64_t mrFlags, void **mhandle,
						      void **ginHandle)
{
	const ncclResult_t unsupported = ncclInternalError;

	NCCL_OFI_WARN("gin GDAKI: regMrSymDmaBuf not yet implemented");

	return unsupported;
}

/*
 * GDAKI stub for deregMrSym.
 *
 * Not implemented: logs a warning, ignores its arguments and reports
 * ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_deregMrSym(void *collComm, void *mhandle)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: deregMrSym not yet implemented");

	return status;
}

/*
 * GDAKI stub for closeColl.
 *
 * Not implemented: logs a warning, ignores collComm and reports ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_closeColl(void *collComm)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: closeColl not yet implemented");

	return status;
}

/*
 * GDAKI stub for closeListen.
 *
 * Not implemented: logs a warning, ignores listenComm and reports
 * ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_closeListen(void *listenComm)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: closeListen not yet implemented");

	return status;
}

/*
 * GDAKI stub for ginProgress.
 *
 * Not implemented: logs a warning on every call, ignores collComm and
 * reports ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_ginProgress(void *collComm)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: ginProgress not yet implemented");

	return status;
}

/*
 * GDAKI stub for queryLastError.
 *
 * Not implemented: logs a warning, then always reports "no error" by
 * storing false through hasError and returning ncclSuccess. hasError is
 * dereferenced unconditionally, so it must be a valid pointer.
 */
static ncclResult_t nccl_ofi_gin_gdaki_queryLastError(void *ginCtx, bool *hasError)
{
	const bool error_pending = false;

	NCCL_OFI_WARN("gin GDAKI: queryLastError not yet implemented");
	*hasError = error_pending;

	return ncclSuccess;
}

/*
 * GDAKI stub for finalize.
 *
 * Not implemented: logs a warning, ignores ctx and reports ncclSuccess.
 */
static ncclResult_t nccl_ofi_gin_gdaki_finalize(void *ctx)
{
	const ncclResult_t status = ncclSuccess;

	NCCL_OFI_WARN("gin GDAKI: finalize not yet implemented");

	return status;
}

/*
 * GDAKI plugin table for the ncclGin_v11_t interface.
 *
 * - init, devices, listen and connect are defined as nullptr here. When
 *   GDAKI mode is enabled, the GIN init path copies them from the proxy
 *   plugin (ncclGinPlugin_v11) and then overwrites the exported
 *   ncclGinPlugin_v11 symbol with this table.
 * - iput, iputSignal and test stay nullptr (no CPU involvement in GDAKI
 *   mode).
 * - All remaining entries point at the static GDAKI functions defined
 *   earlier in this file; apart from getProperties these are stubs that
 *   only log a "not yet implemented" warning.
 *
 * The initializers below are listed in the order required by the
 * ncclGin_v11_t declaration; keep that order when adding entries.
 */
ncclGin_v11_t nccl_ofi_gin_gdaki_plugin = {
.name = "Libfabric_GDAKI",
.init = nullptr, /* Copied from proxy plugin at init time */
.devices = nullptr, /* Copied from proxy plugin at init time */
.getProperties = nccl_ofi_gin_gdaki_get_properties,
.listen = nullptr, /* Copied from proxy plugin at init time */
.connect = nullptr, /* Copied from proxy plugin at init time */
.createContext = nccl_ofi_gin_gdaki_createContext,
.regMrSym = nccl_ofi_gin_gdaki_regMrSym,
.regMrSymDmaBuf = nccl_ofi_gin_gdaki_regMrSymDmaBuf,
.deregMrSym = nccl_ofi_gin_gdaki_deregMrSym,
.destroyContext = nccl_ofi_gin_gdaki_destroyContext,
.closeColl = nccl_ofi_gin_gdaki_closeColl,
.closeListen = nccl_ofi_gin_gdaki_closeListen,
.iput = nullptr, /* No CPU involvement in GDAKI mode */
.iputSignal = nullptr, /* No CPU involvement in GDAKI mode */
.test = nullptr, /* No CPU involvement in GDAKI mode */
.ginProgress = nccl_ofi_gin_gdaki_ginProgress,
.queryLastError = nccl_ofi_gin_gdaki_queryLastError,
.finalize = nccl_ofi_gin_gdaki_finalize
};
Loading