Mila
Deep Neural Network Library
CudaOps.h File Reference

CUDA kernel function declarations for neural network operations.

#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>


Namespaces

namespace  Mila
 
namespace  Mila::Dnn
 
namespace  Mila::Dnn::Compute
 

Functions

void Mila::Dnn::Compute::cuda_encoder_forward_fp16 (half *Y, const int *X, const half *wte, const half *wpe, int B, int T, int C, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_encoder_forward_fp32 (float *Y, const int *X, const float *wte, const float *wpe, int B, int T, int C, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_gelu_backward_fp16 (half *dX, const half *X, const half *dY, const int N, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_gelu_backward_fp32 (float *dX, const float *X, const float *dY, const int N, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_gelu_forward_fp16 (half *Y, const half *X, int N, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_gelu_forward_fp32 (float *Y, const float *X, int N, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_layernorm_forward_fp16 (half *Y, half *mean, half *rstd, const half *X, const half *weight, const half *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_layernorm_forward_fp32 (float *Y, float *mean, float *rstd, const float *X, const float *weight, const float *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_matmul_forward_fp16 (half *Y, const half *X, const half *weight, const half *bias, int outer_size, int C, int OC, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_matmul_forward_fp32 (float *Y, const float *X, const float *weight, const float *bias, int outer_size, int C, int OC, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_mha_forward_fp16 (half *Y, half *qkvr, half *att, const half *X, int B, int T, int C, int NH, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_mha_forward_fp32 (float *Y, float *qkvr, float *att, const float *X, int B, int T, int C, int NH, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_residual_forward_fp16 (half *Y, const half *X1, const half *X2, int N, cudaStream_t stream)
 
void Mila::Dnn::Compute::cuda_residual_forward_fp32 (float *Y, const float *X1, const float *X2, int N, cudaStream_t stream)
 
template<typename TPrecision>
void Mila::Dnn::Compute::cuda_softmax_crossentropy_backward (TPrecision *dlogits, const TPrecision *dlosses, const TPrecision *probs, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
 
template<typename TPrecision>
void Mila::Dnn::Compute::cuda_softmax_crossentropy_forward (TPrecision *losses, TPrecision *probs, const TPrecision *logits, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
 
template<typename TPrecision>
void Mila::Dnn::Compute::cuda_softmax_forward (TPrecision *Y, const TPrecision *X, int N, int C, cudaStream_t stream)
 
template<typename TPrecision>
void Mila::Dnn::Compute::cuda_softmax_forward_general (TPrecision *Y, const TPrecision *X, int outer_size, int dim_size, int inner_size, cudaStream_t stream)
 

Detailed Description

CUDA kernel function declarations for neural network operations.

This header file declares CUDA kernel functions that implement various neural network operations optimized for execution on NVIDIA GPUs. The operations include:

  • Encoder operations for embedding input tokens
  • GELU activation functions (forward and backward)
  • Layer normalization
  • Matrix multiplication
  • Multi-head attention
  • Residual connections
  • Softmax activation
  • Fused softmax cross-entropy (forward and backward)

Most operations provide explicit implementations for both single-precision (fp32) and half-precision (fp16) floating-point data types to support different performance and accuracy requirements; the softmax and cross-entropy functions are instead templated on a TPrecision parameter and can be instantiated for either precision. These functions serve as the computational backend for the neural network operations in the Mila deep learning framework.
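
As a minimal sketch of how one of these launchers might be called from host code (the buffer setup, error-checking macro, and include path are illustrative assumptions, not part of the Mila API):

#include "CudaOps.h"      // this header; the include path is an assumption
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical error-checking helper, not part of CudaOps.h.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",              \
                         cudaGetErrorString(err_), __FILE__, __LINE__);   \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

int main() {
    const int N = 1 << 20;  // number of activations to transform

    float* d_X = nullptr;
    float* d_Y = nullptr;
    CUDA_CHECK(cudaMalloc(&d_X, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_Y, N * sizeof(float)));
    // ... copy or generate input activations in d_X (omitted) ...

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Launch the fp32 GELU forward pass asynchronously on the stream.
    Mila::Dnn::Compute::cuda_gelu_forward_fp32(d_Y, d_X, N, stream);

    // Block until the kernel has finished before using d_Y.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_Y));
    return 0;
}

The templated variants are invoked the same way with an explicit precision argument, for example Mila::Dnn::Compute::cuda_softmax_forward<float>(d_Y, d_X, N, C, stream).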