25#include <cuda_runtime.h>
33 float* qkvr,
float* att,
35 int B,
int T,
int C,
int NH,
36 cudaStream_t stream );
40 half* qkvr, half* att,
42 int B,
int T,
int C,
int NH,
43 cudaStream_t stream );
47 float* Y,
const int* X,
48 const float* wte,
const float* wpe,
50 cudaStream_t stream );
53 half* Y,
const int* X,
54 const half* wte,
const half* wpe,
56 cudaStream_t stream );
63 cudaStream_t stream );
70 cudaStream_t stream );
76 cudaStream_t stream );
83 cudaStream_t stream );
88 float* mean,
float* rstd,
90 const float* weight,
const float* bias,
91 int B,
int T,
int C,
float epsilon,
92 cudaStream_t stream );
96 half* mean, half* rstd,
98 const half* weight,
const half* bias,
99 int B,
int T,
int C,
float epsilon,
100 cudaStream_t stream );
106 const float* weight,
const float* bias,
107 int outer_size,
int C,
int OC,
108 cudaStream_t stream );
111 half* Y,
const half* X,
112 const half* weight,
const half* bias,
113 int outer_size,
int C,
int OC,
114 cudaStream_t stream );
117 template <
typename TPrecision>
123 cudaStream_t stream );
125 template <
typename TPrecision>
132 cudaStream_t stream );
135 template <
typename TPrecision>
139 const TPrecision* logits,
144 cudaStream_t stream );
146 template <
typename TPrecision>
149 const TPrecision* dlosses,
150 const TPrecision* probs,
155 cudaStream_t stream );
160 const float* X1,
const float* X2,
162 cudaStream_t stream );
166 const half* X1,
const half* X2,
168 cudaStream_t stream );
Definition PrecisionConfig.ixx:13
void cuda_layernorm_forward_fp16(half *Y, half *mean, half *rstd, const half *X, const half *weight, const half *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
void cuda_gelu_forward_fp32(float *Y, const float *X, int N, cudaStream_t stream)
void cuda_gelu_forward_fp16(half *Y, const half *X, int N, cudaStream_t stream)
void cuda_encoder_forward_fp16(half *Y, const int *X, const half *wte, const half *wpe, int B, int T, int C, cudaStream_t stream)
void cuda_mha_forward_fp32(float *Y, float *qkvr, float *att, const float *X, int B, int T, int C, int NH, cudaStream_t stream)
void cuda_softmax_forward_general(TPrecision *Y, const TPrecision *X, int outer_size, int dim_size, int inner_size, cudaStream_t stream)
void cuda_encoder_forward_fp32(float *Y, const int *X, const float *wte, const float *wpe, int B, int T, int C, cudaStream_t stream)
void cuda_gelu_backward_fp32(float *dX, const float *X, const float *dY, const int N, cudaStream_t stream)
void cuda_gelu_backward_fp16(half *dX, const half *X, const half *dY, const int N, cudaStream_t stream)
void cuda_matmul_forward_fp16(half *Y, const half *X, const half *weight, const half *bias, int outer_size, int C, int OC, cudaStream_t stream)
void cuda_softmax_crossentropy_backward(TPrecision *dlogits, const TPrecision *dlosses, const TPrecision *probs, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
void cuda_matmul_forward_fp32(float *Y, const float *X, const float *weight, const float *bias, int outer_size, int C, int OC, cudaStream_t stream)
void cuda_softmax_forward(TPrecision *Y, const TPrecision *X, int N, int C, cudaStream_t stream)
void cuda_layernorm_forward_fp32(float *Y, float *mean, float *rstd, const float *X, const float *weight, const float *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
void cuda_residual_forward_fp32(float *Y, const float *X1, const float *X2, int N, cudaStream_t stream)
void cuda_softmax_crossentropy_forward(TPrecision *losses, TPrecision *probs, const TPrecision *logits, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
void cuda_residual_forward_fp16(half *Y, const half *X1, const half *X2, int N, cudaStream_t stream)
void cuda_mha_forward_fp16(half *Y, half *qkvr, half *att, const half *X, int B, int T, int C, int NH, cudaStream_t stream)