CUDA kernel declarations for tensor reduction operations (sum, mean, max, min).

#include <cuda_runtime.h>
#include <cstddef>

Namespaces
namespace	Mila
	Mila main API namespace.
namespace	Mila::Dnn
namespace	Mila::Dnn::Compute
namespace	Mila::Dnn::Compute::Cuda
namespace	Mila::Dnn::Compute::Cuda::Kernels

Functions
template<typename T>
void	Mila::Dnn::Compute::Cuda::Kernels::launch_max_reduction_kernel (const T *src, T *d_partial_maxes, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
	Launch max reduction kernel producing per-block partial maxima.
template<typename T>
void	Mila::Dnn::Compute::Cuda::Kernels::launch_mean_reduction_kernel (const T *src, float *d_partial_means, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
	Launch mean reduction kernel producing per-block partial sums (float).
template<typename T>
void	Mila::Dnn::Compute::Cuda::Kernels::launch_min_reduction_kernel (const T *src, T *d_partial_mins, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
	Launch min reduction kernel producing per-block partial minima.
template<typename T>
void	Mila::Dnn::Compute::Cuda::Kernels::launch_sum_reduction_kernel (const T *src, float *d_partial_sums, size_t count, int grid, int block, size_t shared_bytes, cudaStream_t stream)
	Launch sum reduction kernel producing per-block partial sums (float).

Detailed Description

CUDA kernel declarations for tensor reduction operations (sum, mean, max, min).

Declares launch functions for optimized CUDA reduction kernels. Each kernel writes per-block partial results to a device buffer; the caller is expected to perform the final reduction over those partial results, either on the host or with a follow-up kernel.

Namespaces

Functions

Detailed Description