JohannesGaessler committed
Commit a916e92 · 1 Parent(s): 4bf69ed

ggml-opt: fix data corruption (ggml/1022)

ggml/src/ggml-backend.cpp CHANGED
@@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
@@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
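
Review note: both entry points previously dereferenced tensor (via tensor->view_src) before any validation, so a null tensor meant undefined behavior rather than a diagnosable error. The new asserts make the failure explicit at the call site. A minimal caller-side sketch of what the guard catches (illustrative only, not code from this commit):

    struct ggml_tensor * t = NULL;   // e.g. a tensor that was never created for this graph
    float value = 0.0f;
    ggml_backend_tensor_set(t, &value, 0, sizeof(value)); // before: null dereference; now: GGML_ASSERT(tensor) fires
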
ggml/src/ggml-impl.h CHANGED
@@ -295,6 +295,9 @@ struct ggml_cgraph {
     enum ggml_cgraph_eval_order order;
 };
 
+// returns a slice of cgraph with nodes [i0, i1)
+// the slice does not have leafs or gradients
+// if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
 // Memory allocation
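
Note: the new comment pins down the contract that the reworked ggml_graph_view in ggml.c (below) now follows: a view carries only node pointers. A hedged usage sketch (assumed caller code, not from this commit; it assumes the public ggml_graph_get_grad lookup from ggml.h):

    struct ggml_cgraph view = ggml_graph_view(cgraph, i0, i1);          // nodes [i0, i1) only
    // view.leafs, view.grads and view.grad_accs are NULL, so gradient lookups
    // must go through the original graph:
    struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, view.nodes[0]);
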
ggml/src/ggml-opt.cpp CHANGED
@@ -14,51 +14,51 @@
 #include <vector>
 
 struct ggml_opt_dataset {
-    struct ggml_context * ctx;
-    ggml_backend_buffer_t buf;
-    struct ggml_tensor * data;
-    struct ggml_tensor * labels;
+    struct ggml_context * ctx    = nullptr;
+    ggml_backend_buffer_t buf    = nullptr;
+    struct ggml_tensor * data    = nullptr;
+    struct ggml_tensor * labels  = nullptr;
 
-    int64_t ndata;
-    int64_t ndata_shard;
-    size_t nbs_data;
-    size_t nbs_labels;
+    int64_t ndata       = -1;
+    int64_t ndata_shard = -1;
+    size_t nbs_data     = -1;
+    size_t nbs_labels   = -1;
 
     std::vector<int64_t> permutation;
 };
 
 struct ggml_opt_context {
-    ggml_backend_sched_t backend_sched;
-    ggml_cgraph * allocated_graph;
-    ggml_cgraph * allocated_graph_copy;
-    struct ggml_context * ctx_static;
-    struct ggml_context * ctx_static_cpu;
-    struct ggml_context * ctx_compute;
-    struct ggml_context * ctx_copy;
-    ggml_backend_buffer_t buf_static;
-    ggml_backend_buffer_t buf_static_cpu;
+    ggml_backend_sched_t backend_sched   = nullptr;
+    ggml_cgraph * allocated_graph        = nullptr;
+    ggml_cgraph * allocated_graph_copy   = nullptr;
+    struct ggml_context * ctx_static     = nullptr;
+    struct ggml_context * ctx_static_cpu = nullptr;
+    struct ggml_context * ctx_compute    = nullptr;
+    struct ggml_context * ctx_copy       = nullptr;
+    ggml_backend_buffer_t buf_static     = nullptr;
+    ggml_backend_buffer_t buf_static_cpu = nullptr;
     std::mt19937 rng;
 
-    struct ggml_tensor * inputs;
-    struct ggml_tensor * outputs;
-    struct ggml_tensor * labels;
+    struct ggml_tensor * inputs  = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels  = nullptr;
 
-    struct ggml_tensor * loss;
-    struct ggml_tensor * pred;
-    struct ggml_tensor * ncorrect;
+    struct ggml_tensor * loss     = nullptr;
+    struct ggml_tensor * pred     = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;
 
-    struct ggml_cgraph * gf;
-    struct ggml_cgraph * gb_grad;
-    struct ggml_cgraph * gb_opt;
+    struct ggml_cgraph * gf      = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt  = nullptr;
 
-    int64_t iter;
-    int32_t opt_period;
-    int32_t opt_i;
-    bool loss_per_datapoint;
+    int64_t iter            = 1;
+    int32_t opt_period      = 1;
+    int32_t opt_i           = 0;
+    bool loss_per_datapoint = false;
 
-    ggml_opt_get_optimizer_params get_opt_pars;
-    void * get_opt_pars_ud;
-    struct ggml_tensor * adamw_params;
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud                     = nullptr;
+    struct ggml_tensor * adamw_params          = nullptr;
 };
 
 struct ggml_opt_result {
@@ -67,8 +67,8 @@ struct ggml_opt_result {
     std::vector<int32_t> pred;
    int64_t ncorrect = 0;
 
-    bool loss_per_datapoint = false;
-    int64_t opt_period = -1;
+    int64_t opt_period      = -1;
+    bool loss_per_datapoint = false;
 };
 
 // ====== Dataset ======
@@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
 }
 
 struct ggml_opt_params ggml_opt_default_params(
-    ggml_backend_sched_t backend_sched,
-    struct ggml_context * ctx_compute,
-    struct ggml_tensor * inputs,
-    struct ggml_tensor * outputs,
-    enum ggml_opt_loss_type loss_type) {
+    ggml_backend_sched_t backend_sched,
+    struct ggml_context * ctx_compute,
+    struct ggml_tensor * inputs,
+    struct ggml_tensor * outputs,
+    enum ggml_opt_loss_type loss_type) {
     return {
         /*backend_sched =*/ backend_sched,
         /*ctx_compute =*/ ctx_compute,
@@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_
     return new_tensor;
 }
 
-static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
     std::map<ggml_tensor *, ggml_tensor *> tensor_map;
 
-    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
 
-    for (int i = 0; i < graph->n_leafs; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
     }
-    for (int i = 0; i < graph->n_nodes; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
     }
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
-        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
-        graph->grads[igrad_dst] = new_graph->grads[igrad_src];
-        graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src];
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
     }
 
-    return new_graph;
+    return dst;
 }
 
 static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
@@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph
 
 ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched = params.backend_sched;
-    result->allocated_graph = nullptr;
-    result->allocated_graph_copy = nullptr;
-    result->ctx_compute = params.ctx_compute;
-    result->ctx_copy = nullptr;
-    result->inputs = params.inputs;
-    result->outputs = params.outputs;
-    result->iter = 1;
-    result->opt_period = params.opt_period;
-    result->opt_i = 0;
-    result->get_opt_pars = params.get_opt_pars;
-    result->get_opt_pars_ud = params.get_opt_pars_ud;
+    result->backend_sched    = params.backend_sched;
+    result->ctx_compute      = params.ctx_compute;
+    result->inputs           = params.inputs;
+    result->outputs          = params.outputs;
+    result->opt_period       = params.opt_period;
+    result->get_opt_pars     = params.get_opt_pars;
+    result->get_opt_pars_ud  = params.get_opt_pars_ud;
 
     GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
     GGML_ASSERT(result->opt_period >= 1);
@@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     switch (params.loss_type) {
         case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
@@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
             break;
         }
         case GGML_OPT_LOSS_TYPE_SUM: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             result->loss_per_datapoint = false;
@@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     }
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->gb_grad = nullptr;
-        result->gb_opt = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gf);
-
         return result;
     }
 
@@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->gb_opt = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gb_grad);
         ggml_graph_reset(result->gb_grad);
-
         return result;
     }
 
@@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
 
-    ggml_opt_alloc_graph(result, result->gb_opt);
     ggml_graph_reset(result->gb_opt);
 
     return result;
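
Review note: the most visible source of the data corruption named in the commit title is the old dup_graph loop, which wrote through the source graph (graph->grads[igrad_dst] = new_graph->grads[igrad_src]), indexing the source's gradient arrays with hash slots computed for the copy; the rewrite copies src into dst in the right direction and asserts that both hash lookups actually hit. The second change is defensive: ggml_opt_dataset and ggml_opt_context now use default member initializers, so fields that an early-return build path (FORWARD or GRAD) never assigns still hold a defined value. A short sketch of that initialization pattern (hypothetical struct, not from the commit):

    struct opt_ctx_sketch {
        ggml_cgraph *         gb_opt     = nullptr; // defined even if no build path ever assigns it
        ggml_backend_buffer_t buf_static = nullptr;
        int32_t               opt_period = 1;
    };
    // ggml_opt_init() then only assigns the fields that actually come from params.
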
ggml/src/ggml.c CHANGED
@@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) {
 }
 
 // utility functions to change gradients
-// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
-// else if a is in zero_table, replace a
+// isrc is the index of tensor in cgraph->visited_has_set.keys
+// the corresponding gradient (accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
 // else, just add/subtract/etc. the gradients
 
 static void ggml_add_or_set(
@@ -5028,11 +5030,14 @@ static void ggml_add_or_set(
     struct ggml_cgraph * cgraph,
     size_t isrc,
     struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
-        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = tensor;
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5040,18 +5045,20 @@ static void ggml_acc_or_set(
     struct ggml_context * ctx,
     struct ggml_cgraph * cgraph,
     size_t isrc,
-    struct ggml_tensor * src,
     struct ggml_tensor * tensor,
     const size_t nb1,
     const size_t nb2,
     const size_t nb3,
     const size_t offset) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
     } else {
         struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5059,13 +5066,15 @@ static void ggml_add1_or_set(
     struct ggml_context * ctx,
     struct ggml_cgraph * cgraph,
     size_t isrc,
-    struct ggml_tensor * src,
     struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5074,11 +5083,14 @@ static void ggml_sub_or_set(
     struct ggml_cgraph * cgraph,
     size_t isrc,
     struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_neg(ctx, tensor);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5095,12 +5107,12 @@ static void ggml_compute_backward(
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
     struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
-    const size_t isrc0 = ggml_hash_find(hash_set, src0);
-    const size_t isrc1 = ggml_hash_find(hash_set, src1);
-    const size_t isrc2 = ggml_hash_find(hash_set, src2);
-    const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
-    const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
-    const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
 
     switch (tensor->op) {
         case GGML_OP_DUP: {
@@ -5200,7 +5212,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SUM: {
            if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
            }
         } break;
         case GGML_OP_SUM_ROWS: {
@@ -5210,7 +5222,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MEAN: {
            if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
            }
         } break;
         case GGML_OP_REPEAT: {
@@ -5363,7 +5375,7 @@ static void ggml_compute_backward(
                    nb3 = (nb3 / n0) * ng;
                }
 
-                ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
+                ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
            }
         } break;
         case GGML_OP_PERMUTE: {
@@ -5597,10 +5609,9 @@ void ggml_build_backward_expand(
 
     const int n_nodes_f = cgraph->n_nodes;
 
-    const size_t hash_size = ggml_hash_size(2*cgraph->size);
-    memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *));
-    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
-    bool * grads_needed = calloc(hash_size, sizeof(bool));
+    memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
 
     {
         bool any_params = false;
@@ -5621,7 +5632,7 @@ void ggml_build_backward_expand(
             continue;
         }
 
-        bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5649,7 @@ void ggml_build_backward_expand(
            } break;
 
            // gradients in node->src[1] for one reason or another have no effect on output gradients
-            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
+            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
            case GGML_OP_GET_ROWS:      // row indices not differentiable
            case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
            case GGML_OP_ROPE:          // positions not differentiable
@@ -5665,9 +5676,12 @@ void ggml_build_backward_expand(
            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
 
        const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+       GGML_ASSERT(igrad != GGML_HASHSET_FULL);
+       GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
        if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grads[igrad] = ggml_dup_tensor(ctx_static, node);
-            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
+            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad] = cgraph->grad_accs[igrad];
+            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
        }
        grads_needed[igrad] = true;
    }
@@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.grad_accs    =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL, NULL },
-        /*.order        =*/ cgraph0->order,
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;
@@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         }
     }
 
+    if (dst->grads) {
+        memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    }
     if (src->grads) {
         GGML_ASSERT(dst->grads != NULL);
         GGML_ASSERT(dst->grad_accs != NULL);
         for (int i = 0; i < src->n_nodes; ++i) {
             const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
             const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
             dst->grads[igrad_dst] = src->grads[igrad_src];
             dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
         }
@@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 
         if (node->op == GGML_OP_OPT_STEP_ADAMW) {
             // clear momenta
-            if (node->src[2]->data) {
-                ggml_set_zero(node->src[2]);
-            }
-            if (node->src[3]->data) {
-                ggml_set_zero(node->src[3]);
-            }
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
         }
 
         // initial gradients of loss should be 1, 0 otherwise
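
Note: the common thread in the ggml.c changes is that grads and grad_accs are indexed by slots of cgraph->visited_hash_set (of size visited_hash_set.size), not by node position and not by a separately computed ggml_hash_size(2*cgraph->size), and every hash lookup is validated before being used as an index. A sketch of the lookup pattern the helpers above rely on (illustrative, mirroring the code in this diff):

    const size_t isrc = ggml_hash_find(&cgraph->visited_hash_set, node);
    if (isrc != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, isrc)) {
        struct ggml_tensor * key      = cgraph->visited_hash_set.keys[isrc]; // the node itself
        struct ggml_tensor * grad     = cgraph->grads     ? cgraph->grads[isrc]     : NULL;
        struct ggml_tensor * grad_acc = cgraph->grad_accs ? cgraph->grad_accs[isrc] : NULL;
        // use key/grad/grad_acc here; all three arrays share the same slot index
    }

This is also why ggml_graph_view now returns NULL gradient arrays: a view has no visited_hash_set of its own, so slot indices taken from the parent graph would be meaningless for it.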