|
Mila 0.13.48
Deep Neural Network Library
|
Functions | |
| template<typename T> | |
| void | launch_abs_kernel (const T *src, T *dst, size_t n, cudaStream_t stream) |
| Launch absolute value: dst = abs(src). | |
| template<typename T> | |
| void | launch_elementwise_add_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise tensor addition: dst = src1 + src2. | |
| template<typename T> | |
| void | launch_elementwise_divide_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise tensor division: dst = src1 / src2. | |
| template<typename T> | |
| void | launch_elementwise_equal_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise equality: dst = (src1 == src2) ? 1 : 0. | |
| template<typename T> | |
| void | launch_elementwise_greater_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise greater than: dst = (src1 > src2) ? 1 : 0. | |
| template<typename T> | |
| void | launch_elementwise_less_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise less than: dst = (src1 < src2) ? 1 : 0. | |
| template<typename T> | |
| void | launch_elementwise_max_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise maximum: dst = max(src1, src2). | |
| template<typename T> | |
| void | launch_elementwise_min_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise minimum: dst = min(src1, src2). | |
| template<typename T> | |
| void | launch_elementwise_multiply_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise tensor multiplication: dst = src1 * src2. | |
| template<typename T> | |
| void | launch_elementwise_subtract_kernel (const T *src1, const T *src2, T *dst, size_t n, cudaStream_t stream) |
| Launch element-wise tensor subtraction: dst = src1 - src2. | |
| template<typename T> | |
| void | launch_max_reduction_kernel (const T *src, T *d_partial_maxes, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream) |
| Launch max reduction kernel producing per-block partial maxima. | |
| template<typename T> | |
| void | launch_mean_reduction_kernel (const T *src, float *d_partial_means, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream) |
| Launch mean reduction kernel producing per-block partial sums (float). | |
| template<typename T> | |
| void | launch_min_reduction_kernel (const T *src, T *d_partial_mins, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream) |
| Launch min reduction kernel producing per-block partial minima. | |
| template<typename T> | |
| void | launch_negate_kernel (const T *src, T *dst, size_t n, cudaStream_t stream) |
| Launch negation: dst = -src. | |
| template<typename T> | |
| void | launch_scalar_add_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream) |
| Launch scalar addition: dst = src + scalar. | |
| template<typename T> | |
| void | launch_scalar_divide_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream) |
| Launch scalar division: dst = src / scalar. | |
| template<typename T> | |
| void | launch_scalar_multiply_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream) |
| Launch scalar multiplication: dst = src * scalar. | |
| template<typename T> | |
| void | launch_scalar_subtract_kernel (const T *src, T *dst, T scalar, size_t n, cudaStream_t stream) |
| Launch scalar subtraction: dst = src - scalar. | |
| template<typename T> | |
| void | launch_sqrt_kernel (const T *src, T *dst, size_t n, cudaStream_t stream) |
| Launch square root: dst = sqrt(src). | |
| template<typename T> | |
| void | launch_square_kernel (const T *src, T *dst, size_t n, cudaStream_t stream) |
| Launch square: dst = src * src. | |
| template<typename T> | |
| void | launch_sum_reduction_kernel (const T *src, float *d_partial_sums, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream) |
| Launch sum reduction kernel producing per-block partial sums (float). | |
| void Mila::Dnn::Compute::Cuda::Kernels::launch_abs_kernel | ( | const T * | src, |
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch absolute value: dst = abs(src).
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_add_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise tensor addition: dst = src1 + src2.
| src1 | First source tensor data |
| src2 | Second source tensor data |
| dst | Destination tensor data |
| n | Number of elements to process |
| stream | CUDA stream for kernel execution |

| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_divide_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise tensor division: dst = src1 / src2.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_equal_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise equality: dst = (src1 == src2) ? 1 : 0.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_greater_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise greater than: dst = (src1 > src2) ? 1 : 0.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_less_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise less than: dst = (src1 < src2) ? 1 : 0.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_max_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise maximum: dst = max(src1, src2).
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_min_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise minimum: dst = min(src1, src2).
| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_multiply_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise tensor multiplication: dst = src1 * src2.

| void Mila::Dnn::Compute::Cuda::Kernels::launch_elementwise_subtract_kernel | ( | const T * | src1, |
| const T * | src2, | ||
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch element-wise tensor subtraction: dst = src1 - src2.

| void Mila::Dnn::Compute::Cuda::Kernels::launch_max_reduction_kernel | ( | const T * | src, |
| T * | d_partial_maxes, | ||
| size_t | count, | ||
| int | grid, | ||
| int | block, | ||
| size_t | shared_bytes, | ||
| cudaStream_t | stream ) |
Launch max reduction kernel producing per-block partial maxima.
| T | Source and partial-result element type |
| src | Device pointer to source elements |
| d_partial_maxes | Device pointer to per-block partial maxima (T) |
| count | Number of elements in source |
| grid | Number of blocks to launch (partial-result entries) |
| block | Threads per block |
| shared_bytes | Shared memory size per block (bytes) |
| stream | CUDA stream for kernel execution |
| void Mila::Dnn::Compute::Cuda::Kernels::launch_mean_reduction_kernel | ( | const T * | src, |
| float * | d_partial_means, | ||
| size_t | count, | ||
| int | grid, | ||
| int | block, | ||
| size_t | shared_bytes, | ||
| cudaStream_t | stream ) |
Launch mean reduction kernel producing per-block partial sums (float).
Partial sums are written to the provided device buffer; caller computes final mean by dividing the aggregated sum by the element count.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_min_reduction_kernel | ( | const T * | src, |
| T * | d_partial_mins, | ||
| size_t | count, | ||
| int | grid, | ||
| int | block, | ||
| size_t | shared_bytes, | ||
| cudaStream_t | stream ) |
Launch min reduction kernel producing per-block partial minima.
| T | Source and partial-result element type |
| void Mila::Dnn::Compute::Cuda::Kernels::launch_negate_kernel | ( | const T * | src, |
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch negation: dst = -src.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_add_kernel | ( | const T * | src, |
| T * | dst, | ||
| T | scalar, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch scalar addition: dst = src + scalar.
| src | Source tensor data |
| dst | Destination tensor data |
| scalar | Scalar value to add |
| n | Number of elements to process |
| stream | CUDA stream for kernel execution |
| void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_divide_kernel | ( | const T * | src, |
| T * | dst, | ||
| T | scalar, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch scalar division: dst = src / scalar.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_multiply_kernel | ( | const T * | src, |
| T * | dst, | ||
| T | scalar, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch scalar multiplication: dst = src * scalar.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_scalar_subtract_kernel | ( | const T * | src, |
| T * | dst, | ||
| T | scalar, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch scalar subtraction: dst = src - scalar.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_sqrt_kernel | ( | const T * | src, |
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch square root: dst = sqrt(src).
| void Mila::Dnn::Compute::Cuda::Kernels::launch_square_kernel | ( | const T * | src, |
| T * | dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch square: dst = src * src.
| void Mila::Dnn::Compute::Cuda::Kernels::launch_sum_reduction_kernel | ( | const T * | src, |
| float * | d_partial_sums, | ||
| size_t | count, | ||
| int | grid, | ||
| int | block, | ||
| size_t | shared_bytes, | ||
| cudaStream_t | stream ) |
Launch sum reduction kernel producing per-block partial sums (float).
| T | Source element type |
| src | Device pointer to source elements |
| d_partial_sums | Device pointer to per-block partial sums (float) |
| count | Number of elements in source |
| grid | Number of blocks to launch (partial-sum entries) |
| block | Threads per block |
| shared_bytes | Shared memory size per block (bytes) |
| stream | CUDA stream for kernel execution |