Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::Compute::Cuda::TransferOps Struct Reference [export]

CUDA specialization of TensorOps for tensor transfer operations. More...

Static Public Member Functions

template<TensorDataType TSrcDataType, typename TSrcMemoryResource, TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TSrcDataType, TSrcMemoryResource> && isValidTensor<TDstDataType, TDstMemoryResource>
static void copy (const Tensor< TSrcDataType, TSrcMemoryResource > &src, Tensor< TDstDataType, TDstMemoryResource > &dst, IExecutionContext *exec_context=nullptr)
 Copies tensor data with optional ExecutionContext.
template<TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TDstDataType, TDstMemoryResource>
static void copyFromBlob (const Serialization::ITensorBlob &blob, Tensor< TDstDataType, TDstMemoryResource > &dst, IExecutionContext *exec_context=nullptr)
template<TensorDataType TSrcDataType, TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TDstDataType, TDstMemoryResource>
static void copyFromBlobWithConversion (const Serialization::ITensorBlob &blob, Tensor< TDstDataType, TDstMemoryResource > &dst, IExecutionContext *exec_context=nullptr)
 Copy a blob into a CUDA device tensor with element-wise type conversion.

Static Private Member Functions

template<TensorDataType TDataType>
static void copyDeviceToDevice (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
static void copyDeviceToDeviceWithConversion (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TDataType>
static void copyDeviceToHost (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
static void copyDeviceToHostWithConversion (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TDataType>
static void copyHostToDevice (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
static void copyHostToDeviceWithConversion (const void *src_data, void *dst_data, size_t count, cudaStream_t stream, int device_id)
template<TensorDataType TDataType>
static void copyHostToHost (const void *src_data, void *dst_data, size_t count)
template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
static void copyHostToHostWithConversion (const void *src_data, void *dst_data, size_t count)
template<TensorDataType TDataType, typename TMemoryResource>
requires isValidTensor<TDataType, TMemoryResource>
static const void * getDataPointer (const Tensor< TDataType, TMemoryResource > &tensor)
 Gets raw data pointer from tensor.

Detailed Description

CUDA specialization of TensorOps for tensor transfer operations.

Provides CUDA-specific implementations of tensor transfer operations with automatic optimization based on memory types and optional type conversion. Uses zero-overhead borrowing of ExecutionContext for stream control.

Key features:

  • Automatic transfer direction detection (H2D, D2H, D2D, H2H)
  • Optional type conversion during transfer using CUDA kernels
  • Stream-based asynchronous execution
  • Zero-overhead ExecutionContext borrowing (raw pointer)
  • Automatic fallback to default stream with explicit sync
  • Memory-efficient staging for host-device conversions

Member Function Documentation

◆ copy()

template<TensorDataType TSrcDataType, typename TSrcMemoryResource, TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TSrcDataType, TSrcMemoryResource> && isValidTensor<TDstDataType, TDstMemoryResource>
void Mila::Dnn::Compute::Cuda::TransferOps::copy ( const Tensor< TSrcDataType, TSrcMemoryResource > & src,
Tensor< TDstDataType, TDstMemoryResource > & dst,
IExecutionContext * exec_context = nullptr )
inline static

Copies tensor data with optional ExecutionContext.

Transfers data between tensors with automatic optimization based on memory types. Borrows execution context for stream control with zero overhead. Falls back to default stream when no context provided.

Transfer directions:

  • Host to Device (H2D): cudaMemcpyAsync with HostToDevice
  • Device to Host (D2H): cudaMemcpyAsync with DeviceToHost
  • Device to Device (D2D): Optimized kernel copy
  • Host to Host (H2H): std::memcpy

Type conversion:

  • Same type: Direct memory copy (optimal)
  • Different types: CUDA kernel conversion (D2D) or staged conversion (H2D/D2H)
Template Parameters
TSrcDataType — Source tensor data type
TSrcMemoryResource — Source memory resource type
TDstDataType — Destination tensor data type
TDstMemoryResource — Destination memory resource type
Parameters
src — Source tensor
dst — Destination tensor (must be pre-allocated with matching shape)
exec_context — Optional execution context for stream control (borrowed, not owned)
Exceptions
std::invalid_argument — If tensor shapes don't match
std::runtime_error — If CUDA device is invalid or operations fail
Note
exec_context must outlive this function call
When exec_context provided, caller controls synchronization
When exec_context is null, uses default stream and synchronizes before returning
For H2D/D2H with type conversion, uses temporary device staging buffers

Example:

// With explicit context (caller manages sync)
auto ctx = std::make_unique<CudaExecutionContext>(0);
copy(src_tensor, dst_tensor, ctx.get());
// ... queue more operations
ctx->synchronize();
// Without context (automatic sync)
copy(src_tensor, dst_tensor); // Returns after sync completes
static void copy(const Tensor< TSrcDataType, TSrcMemoryResource > &src, Tensor< TDstDataType, TDstMemoryResource > &dst, IExecutionContext *exec_context=nullptr)
Copy tensor data between pre-allocated tensors.
Definition CpuTensorOps.Transfer.ixx:152
Here is the call graph for this function:

◆ copyDeviceToDevice()

template<TensorDataType TDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyDeviceToDevice ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyDeviceToDeviceWithConversion()

template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyDeviceToDeviceWithConversion ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyDeviceToHost()

template<TensorDataType TDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyDeviceToHost ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyDeviceToHostWithConversion()

template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyDeviceToHostWithConversion ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyFromBlob()

template<TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TDstDataType, TDstMemoryResource>
void Mila::Dnn::Compute::Cuda::TransferOps::copyFromBlob ( const Serialization::ITensorBlob & blob,
Tensor< TDstDataType, TDstMemoryResource > & dst,
IExecutionContext * exec_context = nullptr )
inline static
Here is the call graph for this function:

◆ copyFromBlobWithConversion()

template<TensorDataType TSrcDataType, TensorDataType TDstDataType, typename TDstMemoryResource>
requires isValidTensor<TDstDataType, TDstMemoryResource>
void Mila::Dnn::Compute::Cuda::TransferOps::copyFromBlobWithConversion ( const Serialization::ITensorBlob & blob,
Tensor< TDstDataType, TDstMemoryResource > & dst,
IExecutionContext * exec_context = nullptr )
inline static

Copy a blob into a CUDA device tensor with element-wise type conversion.

The blob carries host elements of type TSrcDataType. The converting H2D transfer stages the source in a temporary device buffer, then runs launch_convert_copy_kernel to cast each element to TDstDataType in-place, avoiding a second pass.

Template Parameters
TSrcDataType — Blob element dtype (e.g. BF16).
TDstDataType — Destination tensor dtype (e.g. FP8_E4M3).
TDstMemoryResource — Destination memory resource (CUDA device memory).
Here is the call graph for this function:

◆ copyHostToDevice()

template<TensorDataType TDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyHostToDevice ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyHostToDeviceWithConversion()

template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyHostToDeviceWithConversion ( const void * src_data,
void * dst_data,
size_t count,
cudaStream_t stream,
int device_id )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copyHostToHost()

template<TensorDataType TDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyHostToHost ( const void * src_data,
void * dst_data,
size_t count )
inline static private
Here is the caller graph for this function:

◆ copyHostToHostWithConversion()

template<TensorDataType TSrcDataType, TensorDataType TDstDataType>
void Mila::Dnn::Compute::Cuda::TransferOps::copyHostToHostWithConversion ( const void * src_data,
void * dst_data,
size_t count )
inline static private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getDataPointer()

template<TensorDataType TDataType, typename TMemoryResource>
requires isValidTensor<TDataType, TMemoryResource>
const void * Mila::Dnn::Compute::Cuda::TransferOps::getDataPointer ( const Tensor< TDataType, TMemoryResource > & tensor)
inline static private

Gets raw data pointer from tensor.

Uses public data() method for host-accessible tensors. For device-only tensors, uses buffer's data() via protected access.

Here is the call graph for this function:

The documentation for this struct was generated from the following file: