Spaces:
Running
Running
cuda, metal : fix nans in soft_max (llama/5574)
Browse files* cuda : fix nans in soft_max
* metal : fix nans in soft_max
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml-cuda.cu +4 -4
- ggml-metal.metal +4 -4
ggml-cuda.cu
CHANGED
|
@@ -6205,7 +6205,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|
| 6205 |
const int ix = rowx*ncols + col;
|
| 6206 |
const int iy = rowy*ncols + col;
|
| 6207 |
|
| 6208 |
-
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
|
| 6209 |
|
| 6210 |
vals[col] = val;
|
| 6211 |
max_val = max(max_val, val);
|
|
@@ -9170,17 +9170,17 @@ static void ggml_cuda_op_soft_max(
|
|
| 9170 |
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
| 9171 |
|
| 9172 |
// positions tensor
|
| 9173 |
-
float * src2_dd =
|
| 9174 |
cuda_pool_alloc<float> src2_f;
|
| 9175 |
|
| 9176 |
ggml_tensor * src2 = dst->src[2];
|
| 9177 |
const bool use_src2 = src2 != nullptr;
|
| 9178 |
|
| 9179 |
if (use_src2) {
|
| 9180 |
-
const bool src2_on_device =
|
| 9181 |
-
ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
| 9182 |
|
| 9183 |
if (src2_on_device) {
|
|
|
|
| 9184 |
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
| 9185 |
} else {
|
| 9186 |
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
|
|
|
| 6205 |
const int ix = rowx*ncols + col;
|
| 6206 |
const int iy = rowy*ncols + col;
|
| 6207 |
|
| 6208 |
+
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
| 6209 |
|
| 6210 |
vals[col] = val;
|
| 6211 |
max_val = max(max_val, val);
|
|
|
|
| 9170 |
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
| 9171 |
|
| 9172 |
// positions tensor
|
| 9173 |
+
float * src2_dd = nullptr;
|
| 9174 |
cuda_pool_alloc<float> src2_f;
|
| 9175 |
|
| 9176 |
ggml_tensor * src2 = dst->src[2];
|
| 9177 |
const bool use_src2 = src2 != nullptr;
|
| 9178 |
|
| 9179 |
if (use_src2) {
|
| 9180 |
+
const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
|
|
|
|
| 9181 |
|
| 9182 |
if (src2_on_device) {
|
| 9183 |
+
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
| 9184 |
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
| 9185 |
} else {
|
| 9186 |
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
ggml-metal.metal
CHANGED
|
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
|
| 392 |
float lmax = -INFINITY;
|
| 393 |
|
| 394 |
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
| 395 |
-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
| 396 |
}
|
| 397 |
|
| 398 |
// find the max value in the block
|
|
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
|
| 417 |
// parallel sum
|
| 418 |
float lsum = 0.0f;
|
| 419 |
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
| 420 |
-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
| 421 |
lsum += exp_psrc0;
|
| 422 |
pdst[i00] = exp_psrc0;
|
| 423 |
}
|
|
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
|
| 495 |
float4 lmax4 = -INFINITY;
|
| 496 |
|
| 497 |
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
| 498 |
-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
| 499 |
}
|
| 500 |
|
| 501 |
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
| 521 |
// parallel sum
|
| 522 |
float4 lsum4 = 0.0f;
|
| 523 |
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
| 524 |
-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
| 525 |
lsum4 += exp_psrc4;
|
| 526 |
pdst4[i00] = exp_psrc4;
|
| 527 |
}
|
|
|
|
| 392 |
float lmax = -INFINITY;
|
| 393 |
|
| 394 |
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
| 395 |
+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
| 396 |
}
|
| 397 |
|
| 398 |
// find the max value in the block
|
|
|
|
| 417 |
// parallel sum
|
| 418 |
float lsum = 0.0f;
|
| 419 |
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
| 420 |
+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
| 421 |
lsum += exp_psrc0;
|
| 422 |
pdst[i00] = exp_psrc0;
|
| 423 |
}
|
|
|
|
| 495 |
float4 lmax4 = -INFINITY;
|
| 496 |
|
| 497 |
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
| 498 |
+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
| 499 |
}
|
| 500 |
|
| 501 |
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
|
|
| 521 |
// parallel sum
|
| 522 |
float4 lsum4 = 0.0f;
|
| 523 |
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
| 524 |
+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
| 525 |
lsum4 += exp_psrc4;
|
| 526 |
pdst4[i00] = exp_psrc4;
|
| 527 |
}
|