slaren ggerganov commited on
Commit
44164ac
·
unverified ·
1 Parent(s): 0206c2d

cuda, metal : fix nans in soft_max (llama/5574)

Browse files

* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (2) hide show
  1. ggml-cuda.cu +4 -4
  2. ggml-metal.metal +4 -4
ggml-cuda.cu CHANGED
@@ -6205,7 +6205,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
6205
  const int ix = rowx*ncols + col;
6206
  const int iy = rowy*ncols + col;
6207
 
6208
- const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
6209
 
6210
  vals[col] = val;
6211
  max_val = max(max_val, val);
@@ -9170,17 +9170,17 @@ static void ggml_cuda_op_soft_max(
9170
  memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
9171
 
9172
  // positions tensor
9173
- float * src2_dd = dst_dd; // default to avoid null checks in the kernel
9174
  cuda_pool_alloc<float> src2_f;
9175
 
9176
  ggml_tensor * src2 = dst->src[2];
9177
  const bool use_src2 = src2 != nullptr;
9178
 
9179
  if (use_src2) {
9180
- const bool src2_on_device = use_src2 && src2->backend == GGML_BACKEND_GPU;
9181
- ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
9182
 
9183
  if (src2_on_device) {
 
9184
  src2_dd = (float *) src2_extra->data_device[g_main_device];
9185
  } else {
9186
  src2_dd = src2_f.alloc(ggml_nelements(src2));
 
6205
  const int ix = rowx*ncols + col;
6206
  const int iy = rowy*ncols + col;
6207
 
6208
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
6209
 
6210
  vals[col] = val;
6211
  max_val = max(max_val, val);
 
9170
  memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
9171
 
9172
  // positions tensor
9173
+ float * src2_dd = nullptr;
9174
  cuda_pool_alloc<float> src2_f;
9175
 
9176
  ggml_tensor * src2 = dst->src[2];
9177
  const bool use_src2 = src2 != nullptr;
9178
 
9179
  if (use_src2) {
9180
+ const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
 
9181
 
9182
  if (src2_on_device) {
9183
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
9184
  src2_dd = (float *) src2_extra->data_device[g_main_device];
9185
  } else {
9186
  src2_dd = src2_f.alloc(ggml_nelements(src2));
ggml-metal.metal CHANGED
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
392
  float lmax = -INFINITY;
393
 
394
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
396
  }
397
 
398
  // find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
417
  // parallel sum
418
  float lsum = 0.0f;
419
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
421
  lsum += exp_psrc0;
422
  pdst[i00] = exp_psrc0;
423
  }
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
495
  float4 lmax4 = -INFINITY;
496
 
497
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
499
  }
500
 
501
  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
521
  // parallel sum
522
  float4 lsum4 = 0.0f;
523
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
525
  lsum4 += exp_psrc4;
526
  pdst4[i00] = exp_psrc4;
527
  }
 
392
  float lmax = -INFINITY;
393
 
394
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
396
  }
397
 
398
  // find the max value in the block
 
417
  // parallel sum
418
  float lsum = 0.0f;
419
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
421
  lsum += exp_psrc0;
422
  pdst[i00] = exp_psrc0;
423
  }
 
495
  float4 lmax4 = -INFINITY;
496
 
497
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
499
  }
500
 
501
  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
 
521
  // parallel sum
522
  float4 lsum4 = 0.0f;
523
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
525
  lsum4 += exp_psrc4;
526
  pdst4[i00] = exp_psrc4;
527
  }