diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs
index 425fd74f76..5f36e28f42 100644
--- a/candle-core/src/cuda_backend/device.rs
+++ b/candle-core/src/cuda_backend/device.rs
@@ -10,6 +10,25 @@
 use std::sync::{Arc, Mutex, RwLock};
 
 use super::{CudaError, CudaStorage, CudaStorageSlice, WrapErr};
 
+/// Enable peer access from the *currently bound* CUDA context to the
+/// supplied peer context. Caller is responsible for ensuring the right
+/// context is current (`bind_to_thread`) before calling.
+///
+/// Treats `CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED` (704) as success so
+/// the public `enable_peer_access` is safely idempotent across repeat
+/// calls between the same context pair. Other errors surface via
+/// `WrapErr`.
+fn enable_peer_access_one_way(peer_ctx: cudarc::driver::sys::CUcontext) -> Result<()> {
+    let res = unsafe { cudarc::driver::sys::cuCtxEnablePeerAccess(peer_ctx, 0) };
+    if res == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS
+        || res == cudarc::driver::sys::cudaError_enum::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
+    {
+        Ok(())
+    } else {
+        Err(cudarc::driver::DriverError(res)).w()
+    }
+}
+
 /// Unique identifier for cuda devices.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@@ -173,6 +192,57 @@ impl CudaDevice {
         self.context.is_event_tracking()
     }
 
+    /// Enable peer access between this device's context and another's, in
+    /// both directions, so that GPU-direct cross-card tensor operations
+    /// (`memcpy_peer_async` / `Tensor::to_device(&other_cuda)`) can route
+    /// over NVLink (or PCIe P2P) instead of erroring with
+    /// `CUDA_ERROR_INVALID_CONTEXT`.
+    ///
+    /// Idempotent in two senses:
+    /// - Calling on the same `CudaDevice` (i.e. same ordinal) is a no-op,
+    ///   returning `Ok(())`. Same-context "peer" access is meaningless.
+    /// - Calling repeatedly between the same two contexts is safe.
+    ///   `cuCtxEnablePeerAccess` returns `CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`
+    ///   on the second call, which this method maps to `Ok(())`.
+    ///
+    /// Operators that hold multiple `CudaDevice` instances on different
+    /// ordinals must call this between every pair they intend to do
+    /// cross-card transfers across. Without it, cudarc's safe
+    /// `memcpy_dtod` correctly dispatches to `memcpy_peer_async` when
+    /// source and destination contexts differ — but `memcpy_peer_async`
+    /// requires peer access to have been enabled first, otherwise the
+    /// driver rejects with `CUDA_ERROR_INVALID_CONTEXT`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the underlying CUDA error if the device pair doesn't
+    /// support peer access (e.g. some heterogeneous or
+    /// IOMMU-isolated configurations) or if either context is in a
+    /// terminal state. Check `cuDeviceCanAccessPeer` separately if you
+    /// need to probe support before attempting to enable it.
+    pub fn enable_peer_access(&self, other: &Self) -> Result<()> {
+        let self_ord = self.context.ordinal();
+        let other_ord = other.context.ordinal();
+        if self_ord == other_ord {
+            // Same physical device. No peer to enable; not an error.
+            return Ok(());
+        }
+        let self_ctx = self.context.cu_ctx();
+        let other_ctx = other.context.cu_ctx();
+        // self ←→ other: enable each direction. The helper folds
+        // PEER_ACCESS_ALREADY_ENABLED into Ok(()) so repeat calls are safe.
+        self.context.bind_to_thread().w()?;
+        enable_peer_access_one_way(other_ctx)?;
+        other.context.bind_to_thread().w()?;
+        let res = enable_peer_access_one_way(self_ctx);
+        // Restore `self` as current even if the second enable failed —
+        // `bind_to_thread` above pushed `other` onto this OS thread, and
+        // callers that proceed with `self` work immediately after this
+        // method shouldn't have to re-bind (or inherit `other` on error).
+        self.context.bind_to_thread().w()?;
+        res
+    }
+
     #[cfg(all(feature = "ug", not(target_arch = "wasm32")))]
     pub fn compile(
         &self,
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
index d0167c61e9..9242fa44a9 100644
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -255,6 +255,31 @@ impl Device {
         Ok(Self::Cuda(crate::CudaDevice::new_with_stream(ordinal)?))
     }
 
+    /// Enable bidirectional peer access between two CUDA devices on
+    /// different ordinals so GPU-direct cross-card tensor operations
+    /// (`Tensor::to_device(&other_cuda)` →
+    /// `cudarc::CudaStream::clone_dtod` → `memcpy_peer_async`) succeed.
+    ///
+    /// Idempotent: same-ordinal pairs and already-enabled pairs both
+    /// return `Ok(())`. See
+    /// [`crate::CudaDevice::enable_peer_access`] for the underlying
+    /// semantics.
+    ///
+    /// Returns an error if either device is not CUDA (e.g. `Cpu` or
+    /// `Metal`), or if the underlying driver call rejects the request
+    /// (peer access unsupported on this hardware pair, etc.).
+    #[cfg(feature = "cuda")]
+    pub fn enable_peer_access(&self, other: &Self) -> Result<()> {
+        match (self, other) {
+            (Self::Cuda(a), Self::Cuda(b)) => a.enable_peer_access(b),
+            _ => crate::bail!(
+                "enable_peer_access requires two CUDA devices, got {:?} and {:?}",
+                self.location(),
+                other.location()
+            ),
+        }
+    }
+
     pub fn new_metal(ordinal: usize) -> Result<Self> {
         Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
     }