Mila
Deep Neural Network Library
Loading...
Searching...
No Matches
CudaOps.h
Go to the documentation of this file.
1
22#pragma once
23
24#include <cublasLt.h>
25#include <cuda_runtime.h>
26#include <cuda_fp16.h>
27
28namespace Mila::Dnn::Compute
29{
30 // Attention functions
32 float* Y,
33 float* qkvr, float* att,
34 const float* X,
35 int B, int T, int C, int NH,
36 cudaStream_t stream );
37
39 half* Y,
40 half* qkvr, half* att,
41 const half* X,
42 int B, int T, int C, int NH,
43 cudaStream_t stream );
44
45 // Encoder functions
47 float* Y, const int* X,
48 const float* wte, const float* wpe,
49 int B, int T, int C,
50 cudaStream_t stream );
51
53 half* Y, const int* X,
54 const half* wte, const half* wpe,
55 int B, int T, int C,
56 cudaStream_t stream );
57
58 // GELU functions
60 float* Y,
61 const float* X,
62 int N,
63 cudaStream_t stream );
64
66 float* dX,
67 const float* X,
68 const float* dY,
69 const int N,
70 cudaStream_t stream );
71
73 half* Y,
74 const half* X,
75 int N,
76 cudaStream_t stream );
77
79 half* dX,
80 const half* X,
81 const half* dY,
82 const int N,
83 cudaStream_t stream );
84
85 // LayerNorm functions
87 float* Y,
88 float* mean, float* rstd,
89 const float* X,
90 const float* weight, const float* bias,
91 int B, int T, int C, float epsilon,
92 cudaStream_t stream );
93
95 half* Y,
96 half* mean, half* rstd,
97 const half* X,
98 const half* weight, const half* bias,
99 int B, int T, int C, float epsilon,
100 cudaStream_t stream );
101
102 // Matmul functions
104 float* Y,
105 const float* X,
106 const float* weight, const float* bias,
107 int outer_size, int C, int OC,
108 cudaStream_t stream );
109
111 half* Y, const half* X,
112 const half* weight, const half* bias,
113 int outer_size, int C, int OC,
114 cudaStream_t stream );
115
116 // Softmax functions
117 template <typename TPrecision>
119 TPrecision* Y,
120 const TPrecision* X,
121 int N,
122 int C,
123 cudaStream_t stream );
124
125 template <typename TPrecision>
127 TPrecision* Y,
128 const TPrecision* X,
129 int outer_size,
130 int dim_size,
131 int inner_size,
132 cudaStream_t stream );
133
134 // SoftmaxCrossEntropy functions
135 template <typename TPrecision>
137 TPrecision* losses,
138 TPrecision* probs,
139 const TPrecision* logits,
140 const int* targets,
141 int batch_size,
142 int seq_len,
143 int vocab_size,
144 cudaStream_t stream );
145
146 template <typename TPrecision>
148 TPrecision* dlogits,
149 const TPrecision* dlosses,
150 const TPrecision* probs,
151 const int* targets,
152 int batch_size,
153 int seq_len,
154 int vocab_size,
155 cudaStream_t stream );
156
157 // Residual functions
159 float* Y,
160 const float* X1, const float* X2,
161 int N,
162 cudaStream_t stream );
163
165 half* Y,
166 const half* X1, const half* X2,
167 int N,
168 cudaStream_t stream );
169
170
171}
Definition PrecisionConfig.ixx:13
void cuda_layernorm_forward_fp16(half *Y, half *mean, half *rstd, const half *X, const half *weight, const half *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
void cuda_gelu_forward_fp32(float *Y, const float *X, int N, cudaStream_t stream)
void cuda_gelu_forward_fp16(half *Y, const half *X, int N, cudaStream_t stream)
void cuda_encoder_forward_fp16(half *Y, const int *X, const half *wte, const half *wpe, int B, int T, int C, cudaStream_t stream)
void cuda_mha_forward_fp32(float *Y, float *qkvr, float *att, const float *X, int B, int T, int C, int NH, cudaStream_t stream)
void cuda_softmax_forward_general(TPrecision *Y, const TPrecision *X, int outer_size, int dim_size, int inner_size, cudaStream_t stream)
void cuda_encoder_forward_fp32(float *Y, const int *X, const float *wte, const float *wpe, int B, int T, int C, cudaStream_t stream)
void cuda_gelu_backward_fp32(float *dX, const float *X, const float *dY, const int N, cudaStream_t stream)
void cuda_gelu_backward_fp16(half *dX, const half *X, const half *dY, const int N, cudaStream_t stream)
void cuda_matmul_forward_fp16(half *Y, const half *X, const half *weight, const half *bias, int outer_size, int C, int OC, cudaStream_t stream)
void cuda_softmax_crossentropy_backward(TPrecision *dlogits, const TPrecision *dlosses, const TPrecision *probs, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
void cuda_matmul_forward_fp32(float *Y, const float *X, const float *weight, const float *bias, int outer_size, int C, int OC, cudaStream_t stream)
void cuda_softmax_forward(TPrecision *Y, const TPrecision *X, int N, int C, cudaStream_t stream)
void cuda_layernorm_forward_fp32(float *Y, float *mean, float *rstd, const float *X, const float *weight, const float *bias, int B, int T, int C, float epsilon, cudaStream_t stream)
void cuda_residual_forward_fp32(float *Y, const float *X1, const float *X2, int N, cudaStream_t stream)
void cuda_softmax_crossentropy_forward(TPrecision *losses, TPrecision *probs, const TPrecision *logits, const int *targets, int batch_size, int seq_len, int vocab_size, cudaStream_t stream)
void cuda_residual_forward_fp16(half *Y, const half *X1, const half *X2, int N, cudaStream_t stream)
void cuda_mha_forward_fp16(half *Y, half *qkvr, half *att, const half *X, int B, int T, int C, int NH, cudaStream_t stream)