Mila/CudaOps_8h_source.html

#pragma once


#include <cublasLt.h>

#include <cuda_runtime.h>

#include <cuda_fp16.h>


namespace Mila::Dnn::Compute
{
    // Host-callable entry points for the CUDA compute operations.
    //
    // This section only DECLARES the functions; none of them is defined here,
    // so everything below describes the signatures alone.
    //
    // Conventions visible in the signatures:
    //   - pointers to non-const data are writable by the callee,
    //   - pointers to const data are read-only inputs,
    //   - every function takes a cudaStream_t as its last parameter.
    //
    // NOTE(review): pointers are presumably device pointers and the work is
    // presumably enqueued on `stream` — confirm against the implementations.
    // NOTE(review): dimension names (B, T, C, NH, OC, N) are presumably batch,
    // sequence length, channels, number of heads, output channels and element
    // count — confirm against the implementations.

    // ---------------------------------------------------------------------
    // Attention functions
    // ---------------------------------------------------------------------

    /**
     * @brief Multi-head attention forward entry point, float variant.
     *
     * @param Y      Writable result buffer.
     * @param qkvr   Writable buffer (presumably scratch for rearranged Q/K/V — TODO confirm).
     * @param att    Writable buffer (presumably attention scores — TODO confirm).
     * @param X      Read-only input buffer.
     * @param B      Dimension argument (presumably batch size).
     * @param T      Dimension argument (presumably sequence length).
     * @param C      Dimension argument (presumably channel count).
     * @param NH     Dimension argument (presumably number of heads).
     * @param stream CUDA stream handle handed to the implementation.
     */
    void cuda_mha_forward_fp32(
        float* Y,
        float* qkvr,
        float* att,
        const float* X,
        int B,
        int T,
        int C,
        int NH,
        cudaStream_t stream );

    /**
     * @brief Multi-head attention forward entry point, half variant.
     *
     * Same parameter list as cuda_mha_forward_fp32 with every buffer typed
     * as `half` instead of `float`.
     */
    void cuda_mha_forward_fp16(
        half* Y,
        half* qkvr,
        half* att,
        const half* X,
        int B,
        int T,
        int C,
        int NH,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // Encoder functions
    // ---------------------------------------------------------------------

    /**
     * @brief Encoder forward entry point, float variant.
     *
     * @param Y      Writable result buffer.
     * @param X      Read-only integer input (presumably token indices — TODO confirm).
     * @param wte    Read-only table (presumably token embeddings — TODO confirm).
     * @param wpe    Read-only table (presumably positional embeddings — TODO confirm).
     * @param B      Dimension argument (presumably batch size).
     * @param T      Dimension argument (presumably sequence length).
     * @param C      Dimension argument (presumably channel count).
     * @param stream CUDA stream handle handed to the implementation.
     */
    void cuda_encoder_forward_fp32(
        float* Y,
        const int* X,
        const float* wte,
        const float* wpe,
        int B,
        int T,
        int C,
        cudaStream_t stream );

    /**
     * @brief Encoder forward entry point, half variant.
     *
     * Same parameter list as cuda_encoder_forward_fp32 with the floating
     * point buffers typed as `half`; `X` stays `const int*`.
     */
    void cuda_encoder_forward_fp16(
        half* Y,
        const int* X,
        const half* wte,
        const half* wpe,
        int B,
        int T,
        int C,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // GELU functions
    // ---------------------------------------------------------------------

    /**
     * @brief GELU forward entry point, float variant.
     *
     * @param Y      Writable result buffer.
     * @param X      Read-only input buffer.
     * @param N      Size argument (presumably the element count — TODO confirm).
     * @param stream CUDA stream handle handed to the implementation.
     */
    void cuda_gelu_forward_fp32(
        float* Y,
        const float* X,
        int N,
        cudaStream_t stream );

    /**
     * @brief GELU backward entry point, float variant.
     *
     * @param dX     Writable result buffer (presumably gradient w.r.t. X).
     * @param X      Read-only buffer (presumably the forward input).
     * @param dY     Read-only buffer (presumably gradient w.r.t. the forward output).
     * @param N      Size argument (presumably the element count — TODO confirm).
     * @param stream CUDA stream handle handed to the implementation.
     */
    void cuda_gelu_backward_fp32(
        float* dX,
        const float* X,
        const float* dY,
        const int N,
        cudaStream_t stream );

    /**
     * @brief GELU forward entry point, half variant.
     *
     * Same parameter list as cuda_gelu_forward_fp32 with `half` buffers.
     */
    void cuda_gelu_forward_fp16(
        half* Y,
        const half* X,
        int N,
        cudaStream_t stream );

    /**
     * @brief GELU backward entry point, half variant.
     *
     * Same parameter list as cuda_gelu_backward_fp32 with `half` buffers.
     */
    void cuda_gelu_backward_fp16(
        half* dX,
        const half* X,
        const half* dY,
        const int N,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // LayerNorm functions
    // ---------------------------------------------------------------------

    /**
     * @brief Layer normalization forward entry point, float variant.
     *
     * @param Y       Writable result buffer.
     * @param mean    Writable buffer (presumably per-row means — TODO confirm).
     * @param rstd    Writable buffer (presumably reciprocal std-devs — TODO confirm).
     * @param X       Read-only input buffer.
     * @param weight  Read-only buffer (presumably the scale parameters).
     * @param bias    Read-only buffer (presumably the shift parameters).
     * @param B       Dimension argument (presumably batch size).
     * @param T       Dimension argument (presumably sequence length).
     * @param C       Dimension argument (presumably channel count).
     * @param epsilon Float scalar (presumably the variance stabilizer — TODO confirm).
     * @param stream  CUDA stream handle handed to the implementation.
     */
    void cuda_layernorm_forward_fp32(
        float* Y,
        float* mean,
        float* rstd,
        const float* X,
        const float* weight,
        const float* bias,
        int B,
        int T,
        int C,
        float epsilon,
        cudaStream_t stream );

    /**
     * @brief Layer normalization forward entry point, half variant.
     *
     * Same parameter list as cuda_layernorm_forward_fp32 with `half` buffers.
     * Note that `epsilon` is still passed as `float` in this variant.
     */
    void cuda_layernorm_forward_fp16(
        half* Y,
        half* mean,
        half* rstd,
        const half* X,
        const half* weight,
        const half* bias,
        int B,
        int T,
        int C,
        float epsilon,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // Matmul functions
    // ---------------------------------------------------------------------

    /**
     * @brief Matrix multiplication forward entry point, float variant.
     *
     * @param Y          Writable result buffer.
     * @param X          Read-only input buffer.
     * @param weight     Read-only weight buffer.
     * @param bias       Read-only bias buffer
     *                   (NOTE(review): whether nullptr is accepted is not visible here).
     * @param outer_size Dimension argument (presumably the number of input rows — TODO confirm).
     * @param C          Dimension argument (presumably input channels).
     * @param OC         Dimension argument (presumably output channels).
     * @param stream     CUDA stream handle handed to the implementation.
     */
    void cuda_matmul_forward_fp32(
        float* Y,
        const float* X,
        const float* weight,
        const float* bias,
        int outer_size,
        int C,
        int OC,
        cudaStream_t stream );

    /**
     * @brief Matrix multiplication forward entry point, half variant.
     *
     * Same parameter list as cuda_matmul_forward_fp32 with `half` buffers.
     */
    void cuda_matmul_forward_fp16(
        half* Y,
        const half* X,
        const half* weight,
        const half* bias,
        int outer_size,
        int C,
        int OC,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // Softmax functions
    // ---------------------------------------------------------------------

    /**
     * @brief Softmax forward entry point, templated on the element type.
     *
     * Only the template declaration is visible here.
     * NOTE(review): the supported TPrecision types are presumably provided
     * through explicit instantiations elsewhere — confirm.
     *
     * @tparam TPrecision Element type of the input and output buffers.
     * @param Y      Writable result buffer.
     * @param X      Read-only input buffer.
     * @param N      Size argument (presumably the number of rows — TODO confirm).
     * @param C      Size argument (presumably the row length — TODO confirm).
     * @param stream CUDA stream handle handed to the implementation.
     */
    template <typename TPrecision>
    void cuda_softmax_forward(
        TPrecision* Y,
        const TPrecision* X,
        int N,
        int C,
        cudaStream_t stream );

    /**
     * @brief Softmax forward entry point taking three size arguments.
     *
     * Only the template declaration is visible here.
     * NOTE(review): presumably normalizes along the `dim_size` axis of an
     * (outer_size, dim_size, inner_size) layout — confirm.
     *
     * @tparam TPrecision Element type of the input and output buffers.
     * @param Y          Writable result buffer.
     * @param X          Read-only input buffer.
     * @param outer_size Size argument.
     * @param dim_size   Size argument.
     * @param inner_size Size argument.
     * @param stream     CUDA stream handle handed to the implementation.
     */
    template <typename TPrecision>
    void cuda_softmax_forward_general(
        TPrecision* Y,
        const TPrecision* X,
        int outer_size,
        int dim_size,
        int inner_size,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // SoftmaxCrossEntropy functions
    // ---------------------------------------------------------------------

    /**
     * @brief Softmax cross-entropy forward entry point.
     *
     * Only the template declaration is visible here.
     *
     * @tparam TPrecision Element type of the floating point buffers.
     * @param losses     Writable result buffer.
     * @param probs      Writable result buffer.
     * @param logits     Read-only input buffer.
     * @param targets    Read-only integer buffer (presumably class indices — TODO confirm).
     * @param batch_size Size argument.
     * @param seq_len    Size argument.
     * @param vocab_size Size argument.
     * @param stream     CUDA stream handle handed to the implementation.
     */
    template <typename TPrecision>
    void cuda_softmax_crossentropy_forward(
        TPrecision* losses,
        TPrecision* probs,
        const TPrecision* logits,
        const int* targets,
        int batch_size,
        int seq_len,
        int vocab_size,
        cudaStream_t stream );

    /**
     * @brief Softmax cross-entropy backward entry point.
     *
     * Only the template declaration is visible here.
     *
     * @tparam TPrecision Element type of the floating point buffers.
     * @param dlogits    Writable result buffer
     *                   (NOTE(review): overwrite vs. accumulate is not visible here).
     * @param dlosses    Read-only input buffer.
     * @param probs      Read-only input buffer.
     * @param targets    Read-only integer buffer (presumably class indices — TODO confirm).
     * @param batch_size Size argument.
     * @param seq_len    Size argument.
     * @param vocab_size Size argument.
     * @param stream     CUDA stream handle handed to the implementation.
     */
    template <typename TPrecision>
    void cuda_softmax_crossentropy_backward(
        TPrecision* dlogits,
        const TPrecision* dlosses,
        const TPrecision* probs,
        const int* targets,
        int batch_size,
        int seq_len,
        int vocab_size,
        cudaStream_t stream );

    // ---------------------------------------------------------------------
    // Residual functions
    // ---------------------------------------------------------------------

    /**
     * @brief Residual forward entry point, float variant.
     *
     * @param Y      Writable result buffer.
     * @param X1     Read-only input buffer.
     * @param X2     Read-only input buffer.
     * @param N      Size argument (presumably the element count — TODO confirm).
     * @param stream CUDA stream handle handed to the implementation.
     */
    void cuda_residual_forward_fp32(
        float* Y,
        const float* X1,
        const float* X2,
        int N,
        cudaStream_t stream );

    /**
     * @brief Residual forward entry point, half variant.
     *
     * Same parameter list as cuda_residual_forward_fp32 with `half` buffers.
     */
    void cuda_residual_forward_fp16(
        half* Y,
        const half* X1,
        const half* X2,
        int N,
        cudaStream_t stream );
}

Mila::Dnn::Compute
Definition PrecisionConfig.ixx:13

Mila::Dnn::Compute::cuda_layernorm_forward_fp16
void cuda_layernorm_forward_fp16(half *Y, half *mean, half *rstd, const half *X, const half *weight, const half *bias, int B, int T, int C, float epsilon, cudaStream_t stream)

Mila::Dnn::Compute::cuda_gelu_forward_fp32
void cuda_gelu_forward_fp32(float *Y, const float *X, int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_gelu_forward_fp16
void cuda_gelu_forward_fp16(half *Y, const half *X, int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_encoder_forward_fp16
void cuda_encoder_forward_fp16(half *Y, const int *X, const half *wte, const half *wpe, int B, int T, int C, cudaStream_t stream)

Mila::Dnn::Compute::cuda_mha_forward_fp32
void cuda_mha_forward_fp32(float *Y, float *qkvr, float *att, const float *X, int B, int T, int C, int NH, cudaStream_t stream)

Mila::Dnn::Compute::cuda_softmax_forward_general
void cuda_softmax_forward_general(TPrecision *Y, const TPrecision *X, int outer_size, int dim_size, int inner_size, cudaStream_t stream)

Mila::Dnn::Compute::cuda_encoder_forward_fp32
void cuda_encoder_forward_fp32(float *Y, const int *X, const float *wte, const float *wpe, int B, int T, int C, cudaStream_t stream)

Mila::Dnn::Compute::cuda_gelu_backward_fp32
void cuda_gelu_backward_fp32(float *dX, const float *X, const float *dY, const int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_gelu_backward_fp16
void cuda_gelu_backward_fp16(half *dX, const half *X, const half *dY, const int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_matmul_forward_fp16
void cuda_matmul_forward_fp16(half *Y, const half *X, const half *weight, const half *bias, int outer_size, int C, int OC, cudaStream_t stream)

Mila::Dnn::Compute::cuda_softmax_crossentropy_backward
void cuda_softmax_crossentropy_backward(TPrecision *dlogits, const TPrecision *dlosses, const TPrecision *probs, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)

Mila::Dnn::Compute::cuda_matmul_forward_fp32
void cuda_matmul_forward_fp32(float *Y, const float *X, const float *weight, const float *bias, int outer_size, int C, int OC, cudaStream_t stream)

Mila::Dnn::Compute::cuda_softmax_forward
void cuda_softmax_forward(TPrecision *Y, const TPrecision *X, int N, int C, cudaStream_t stream)

Mila::Dnn::Compute::cuda_layernorm_forward_fp32
void cuda_layernorm_forward_fp32(float *Y, float *mean, float *rstd, const float *X, const float *weight, const float *bias, int B, int T, int C, float epsilon, cudaStream_t stream)

Mila::Dnn::Compute::cuda_residual_forward_fp32
void cuda_residual_forward_fp32(float *Y, const float *X1, const float *X2, int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_softmax_crossentropy_forward
void cuda_softmax_crossentropy_forward(TPrecision *losses, TPrecision *probs, const TPrecision *logits, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)

Mila::Dnn::Compute::cuda_residual_forward_fp16
void cuda_residual_forward_fp16(half *Y, const half *X1, const half *X2, int N, cudaStream_t stream)

Mila::Dnn::Compute::cuda_mha_forward_fp16
void cuda_mha_forward_fp16(half *Y, half *qkvr, half *att, const half *X, int B, int T, int C, int NH, cudaStream_t stream)