Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::Compute::Cuda::Kernels Namespace Reference

Functions

template<typename T>
void launch_abs_kernel (const T *src, T *dst, size_t n, cudaStream_t stream)
 Launch absolute value: dst = abs(src).
template<typename T>
void launch_elementwise_add_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise tensor addition: dst = src1 + src2.
template<typename T>
void launch_elementwise_divide_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise tensor division: dst = src1 / src2.
template<typename T>
void launch_elementwise_equal_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise equality: dst = (src1 == src2) ? 1 : 0.
template<typename T>
void launch_elementwise_greater_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise greater than: dst = (src1 > src2) ? 1 : 0.
template<typename T>
void launch_elementwise_less_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise less than: dst = (src1 < src2) ? 1 : 0.
template<typename T>
void launch_elementwise_max_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise maximum: dst = max(src1, src2).
template<typename T>
void launch_elementwise_min_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise minimum: dst = min(src1, src2).
template<typename T>
void launch_elementwise_multiply_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise tensor multiplication: dst = src1 * src2.
template<typename T>
void launch_elementwise_subtract_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream)
 Launch element-wise tensor subtraction: dst = src1 - src2.
template<typename T>
void launch_max_reduction_kernel (const T *src, T *d_partial_maxes, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
 Launch max reduction kernel producing per-block partial maxima.
template<typename T>
void launch_mean_reduction_kernel (const T *src, float *d_partial_means, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
 Launch mean reduction kernel producing per-block partial sums (float).
template<typename T>
void launch_min_reduction_kernel (const T *src, T *d_partial_mins, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
 Launch min reduction kernel producing per-block partial minima.
template<typename T>
void launch_negate_kernel (const T *src, T *dst, size_t n, cudaStream_t stream)
 Launch negation: dst = -src.
template<typename T>
void launch_scalar_add_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream)
 Launch scalar addition: dst = src + scalar.
template<typename T>
void launch_scalar_divide_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream)
 Launch scalar division: dst = src / scalar.
template<typename T>
void launch_scalar_multiply_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream)
 Launch scalar multiplication: dst = src * scalar.
template<typename T>
void launch_scalar_subtract_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream)
 Launch scalar subtraction: dst = src - scalar.
template<typename T>
void launch_sqrt_kernel (const T *src, T *dst, size_t n, cudaStream_t stream)
 Launch square root: dst = sqrt(src).
template<typename T>
void launch_square_kernel (const T *src, T *dst, size_t n, cudaStream_t stream)
 Launch square: dst = src * src.
template<typename T>
void launch_sum_reduction_kernel (const T *src, float *d_partial_sums, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
 Launch sum reduction kernel producing per-block partial sums (float).

Function Documentation

◆ launch_abs_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_abs_kernel ( const T * src,
T * dst,
size_t n,
cudaStream_t stream )

Launch absolute value: dst = abs(src).

◆ launch_elementwise_add_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_add_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise tensor addition: dst = src1 + src2.

Parameters
src1 — First source tensor data
src2 — Second source tensor data
dst — Destination tensor data
n — Number of elements to process
stream — CUDA stream for kernel execution
Here is the caller graph for this function:

◆ launch_elementwise_divide_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_divide_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise tensor division: dst = src1 / src2.

◆ launch_elementwise_equal_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_equal_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise equality: dst = (src1 == src2) ? 1 : 0.

◆ launch_elementwise_greater_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_greater_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise greater than: dst = (src1 > src2) ? 1 : 0.

◆ launch_elementwise_less_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_less_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise less than: dst = (src1 < src2) ? 1 : 0.

◆ launch_elementwise_max_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_max_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise maximum: dst = max(src1, src2).

◆ launch_elementwise_min_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_min_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise minimum: dst = min(src1, src2).

◆ launch_elementwise_multiply_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_multiply_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise tensor multiplication: dst = src1 * src2.

Here is the caller graph for this function:

◆ launch_elementwise_subtract_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_subtract_kernel ( const T * src1,
const T * src2,
T * dst,
size_t n,
cudaStream_t stream )

Launch element-wise tensor subtraction: dst = src1 - src2.

Here is the caller graph for this function:

◆ launch_max_reduction_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_max_reduction_kernel ( const T * src,
T * d_partial_maxes,
size_t count,
int grid,
int block,
size_t shared_bytes,
cudaStream_t stream )

Launch max reduction kernel producing per-block partial maxima.

Template Parameters
T — Source and partial-result element type
Parameters
src — Device pointer to source elements
d_partial_maxes — Device pointer to per-block partial maxima (T)
count — Number of elements in source
grid — Number of blocks to launch (partial-result entries)
block — Threads per block
shared_bytes — Shared memory size per block (bytes)
stream — CUDA stream for kernel execution

◆ launch_mean_reduction_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_mean_reduction_kernel ( const T * src,
float * d_partial_means,
size_t count,
int grid,
int block,
size_t shared_bytes,
cudaStream_t stream )

Launch mean reduction kernel producing per-block partial sums (float).

Partial sums are written to the provided device buffer; the caller computes the final mean by dividing the aggregated sum by the element count.

◆ launch_min_reduction_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_min_reduction_kernel ( const T * src,
T * d_partial_mins,
size_t count,
int grid,
int block,
size_t shared_bytes,
cudaStream_t stream )

Launch min reduction kernel producing per-block partial minima.

Template Parameters
T — Source and partial-result element type

◆ launch_negate_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_negate_kernel ( const T * src,
T * dst,
size_t n,
cudaStream_t stream )

Launch negation: dst = -src.

◆ launch_scalar_add_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_add_kernel ( const T * src,
T * dst,
T scalar,
size_t n,
cudaStream_t stream )

Launch scalar addition: dst = src + scalar.

Parameters
src — Source tensor data
dst — Destination tensor data
scalar — Scalar value to add
n — Number of elements to process
stream — CUDA stream for kernel execution

◆ launch_scalar_divide_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_divide_kernel ( const T * src,
T * dst,
T scalar,
size_t n,
cudaStream_t stream )

Launch scalar division: dst = src / scalar.

◆ launch_scalar_multiply_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_multiply_kernel ( const T * src,
T * dst,
T scalar,
size_t n,
cudaStream_t stream )

Launch scalar multiplication: dst = src * scalar.

◆ launch_scalar_subtract_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_subtract_kernel ( const T * src,
T * dst,
T scalar,
size_t n,
cudaStream_t stream )

Launch scalar subtraction: dst = src - scalar.

◆ launch_sqrt_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_sqrt_kernel ( const T * src,
T * dst,
size_t n,
cudaStream_t stream )

Launch square root: dst = sqrt(src).

◆ launch_square_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_square_kernel ( const T * src,
T * dst,
size_t n,
cudaStream_t stream )

Launch square: dst = src * src.

◆ launch_sum_reduction_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::Kernels::launch_sum_reduction_kernel ( const T * src,
float * d_partial_sums,
size_t count,
int grid,
int block,
size_t shared_bytes,
cudaStream_t stream )

Launch sum reduction kernel producing per-block partial sums (float).

Template Parameters
T — Source element type
Parameters
src — Device pointer to source elements
d_partial_sums — Device pointer to per-block partial sums (float)
count — Number of elements in source
grid — Number of blocks to launch (partial-sum entries)
block — Threads per block
shared_bytes — Shared memory size per block (bytes)
stream — CUDA stream for kernel execution