Commit a916e92 (parent 4bf69ed)
ggml-opt: fix data corruption (ggml/1022)
Files changed:
- ggml/src/ggml-backend.cpp +2 -0
- ggml/src/ggml-impl.h +3 -0
- ggml/src/ggml-opt.cpp +67 -80
- ggml/src/ggml.c +57 -37
ggml/src/ggml-backend.cpp
CHANGED
@@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
@@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
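Both functions dereference tensor->view_src before doing anything else, so a NULL tensor used to fail at an arbitrary point inside the buffer lookup. The new asserts fail fast at the API boundary instead. A minimal hedged sketch of the case they catch; the failed-lookup scenario is illustrative, not part of the commit:

    // Hedged sketch: a NULL tensor now trips GGML_ASSERT(tensor) immediately
    // instead of dereferencing tensor->view_src (undefined behavior).
    float vals[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    struct ggml_tensor * t = NULL; // e.g. a by-name lookup that found nothing
    ggml_backend_tensor_set(t, vals, /*offset =*/ 0, sizeof(vals)); // clean assertion failure, not UB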
ggml/src/ggml-impl.h
CHANGED
@@ -295,6 +295,9 @@ struct ggml_cgraph {
     enum ggml_cgraph_eval_order order;
 };
 
+// returns a slice of cgraph with nodes [i0, i1)
+// the slice does not have leafs or gradients
+// if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
 // Memory allocation
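The new comment makes the view contract explicit: a view is a shallow slice of the parent's node array, and gradients cannot travel with it because they are indexed by hash-set position in the parent, not by node index. A hedged usage sketch (graph, i0, and i1 are assumed to exist; error handling elided):

    // Hedged sketch: compute over a slice, read gradients from the parent.
    struct ggml_cgraph view = ggml_graph_view(graph, i0, i1); // nodes [i0, i1) only
    // view carries no leafs, grads, or visited_hash_set of its own, so
    // gradient lookups must go through the original graph:
    struct ggml_tensor * grad = ggml_graph_get_grad(graph, graph->nodes[i0]);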
ggml/src/ggml-opt.cpp
CHANGED
@@ -14,51 +14,51 @@
 #include <vector>
 
 struct ggml_opt_dataset {
-    struct ggml_context * ctx;
-    ggml_backend_buffer_t buf;
-    struct ggml_tensor * data;
-    struct ggml_tensor * labels;
+    struct ggml_context * ctx    = nullptr;
+    ggml_backend_buffer_t buf    = nullptr;
+    struct ggml_tensor  * data   = nullptr;
+    struct ggml_tensor  * labels = nullptr;
 
-    int64_t ndata;
-    int64_t ndata_shard;
-    size_t nbs_data;
-    size_t nbs_labels;
+    int64_t ndata       = -1;
+    int64_t ndata_shard = -1;
+    size_t  nbs_data    = -1;
+    size_t  nbs_labels  = -1;
 
     std::vector<int64_t> permutation;
 };
 
 struct ggml_opt_context {
-    ggml_backend_sched_t backend_sched;
-    ggml_cgraph * allocated_graph;
-    ggml_cgraph * allocated_graph_copy;
-    struct ggml_context * ctx_static;
-    struct ggml_context * ctx_static_cpu;
-    struct ggml_context * ctx_compute;
-    struct ggml_context * ctx_copy;
-    ggml_backend_buffer_t buf_static;
-    ggml_backend_buffer_t buf_static_cpu;
+    ggml_backend_sched_t backend_sched   = nullptr;
+    ggml_cgraph * allocated_graph        = nullptr;
+    ggml_cgraph * allocated_graph_copy   = nullptr;
+    struct ggml_context * ctx_static     = nullptr;
+    struct ggml_context * ctx_static_cpu = nullptr;
+    struct ggml_context * ctx_compute    = nullptr;
+    struct ggml_context * ctx_copy       = nullptr;
+    ggml_backend_buffer_t buf_static     = nullptr;
+    ggml_backend_buffer_t buf_static_cpu = nullptr;
     std::mt19937 rng;
 
-    struct ggml_tensor * inputs;
-    struct ggml_tensor * outputs;
-    struct ggml_tensor * labels;
+    struct ggml_tensor * inputs  = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels  = nullptr;
 
-    struct ggml_tensor * loss;
-    struct ggml_tensor * pred;
-    struct ggml_tensor * ncorrect;
+    struct ggml_tensor * loss     = nullptr;
+    struct ggml_tensor * pred     = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;
 
-    struct ggml_cgraph * gf;
-    struct ggml_cgraph * gb_grad;
-    struct ggml_cgraph * gb_opt;
+    struct ggml_cgraph * gf      = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt  = nullptr;
 
-    int64_t iter;
-    int32_t opt_period;
-    int32_t opt_i;
-    bool loss_per_datapoint;
+    int64_t iter               = 1;
+    int32_t opt_period         = 1;
+    int32_t opt_i              = 0;
+    bool    loss_per_datapoint = false;
 
-    ggml_opt_get_optimizer_params get_opt_pars;
-    void * get_opt_pars_ud;
-    struct ggml_tensor * adamw_params;
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud                     = nullptr;
+    struct ggml_tensor * adamw_params          = nullptr;
 };
 
 struct ggml_opt_result {
@@ -67,8 +67,8 @@ struct ggml_opt_result {
     std::vector<int32_t> pred;
     int64_t ncorrect = 0;
 
-    bool loss_per_datapoint = false;
-    int64_t opt_period      = -1;
+    int64_t opt_period      = -1;
+    bool loss_per_datapoint = false;
 };
 
 // ====== Dataset ======
@@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
 }
 
 struct ggml_opt_params ggml_opt_default_params(
-        ggml_backend_sched_t      backend_sched,
-        struct ggml_context     * ctx_compute,
-        struct ggml_tensor      * inputs,
-        struct ggml_tensor      * outputs,
-        enum ggml_opt_loss_type   loss_type) {
+        ggml_backend_sched_t    backend_sched,
+        struct ggml_context   * ctx_compute,
+        struct ggml_tensor    * inputs,
+        struct ggml_tensor    * outputs,
+        enum ggml_opt_loss_type loss_type) {
     return {
         /*backend_sched =*/ backend_sched,
         /*ctx_compute  =*/ ctx_compute,
@@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
     return new_tensor;
 }
 
-static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
     std::map<ggml_tensor *, ggml_tensor *> tensor_map;
 
-    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
 
-    for (int i = 0; i < graph->n_leafs; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
     }
-    for (int i = 0; i < graph->n_nodes; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
     }
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
-        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
-        new_graph->grads[igrad_dst]     = graph->grads[igrad_src];
-        new_graph->grad_accs[igrad_dst] = graph->grad_accs[igrad_src];
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
    }
 
-    return new_graph;
+    return dst;
 }
 
 static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
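The rewritten dup_graph names the direction of the copy (src is only read, dst is only written), sizes the duplicate from src->size instead of a fixed default, and asserts that leaf and node counts survived the copy. The gradient remap rests on the invariant the asserts defend: a tensor and its gradient share the same index in their graph's visited_hash_set. A hedged sketch of that lookup in isolation (graph and node are assumed to exist):

    // Hedged sketch: the hash-indexed gradient lookup this commit leans on.
    // grads/grad_accs are parallel to visited_hash_set.keys, NOT to nodes[],
    // which is why offsetting the gradient arrays by a node index is never valid.
    const size_t igrad = ggml_hash_find(&graph->visited_hash_set, node);
    GGML_ASSERT(igrad != GGML_HASHSET_FULL);
    GGML_ASSERT(ggml_bitset_get(graph->visited_hash_set.used, igrad));
    struct ggml_tensor * grad = graph->grads[igrad];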
@@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
 
 ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched        = params.backend_sched;
-    result->allocated_graph      = nullptr;
-    result->allocated_graph_copy = nullptr;
-    result->ctx_compute          = params.ctx_compute;
-    result->ctx_copy             = nullptr;
-    result->inputs               = params.inputs;
-    result->outputs              = params.outputs;
-    result->iter                 = 1;
-    result->opt_period           = params.opt_period;
-    result->opt_i                = 0;
-    result->get_opt_pars         = params.get_opt_pars;
-    result->get_opt_pars_ud      = params.get_opt_pars_ud;
+    result->backend_sched   = params.backend_sched;
+    result->ctx_compute     = params.ctx_compute;
+    result->inputs          = params.inputs;
+    result->outputs         = params.outputs;
+    result->opt_period      = params.opt_period;
+    result->get_opt_pars    = params.get_opt_pars;
+    result->get_opt_pars_ud = params.get_opt_pars_ud;
 
     GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
     GGML_ASSERT(result->opt_period >= 1);
@@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     switch (params.loss_type) {
         case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
@@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
             break;
         }
         case GGML_OPT_LOSS_TYPE_SUM: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             result->loss_per_datapoint = false;
@@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     }
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->gb_grad = nullptr;
-        result->gb_opt  = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gf);
-
         return result;
     }
 
@@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->gb_opt = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gb_grad);
         ggml_graph_reset(result->gb_grad);
-
         return result;
     }
 
@@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
 
-    ggml_opt_alloc_graph(result, result->gb_opt);
     ggml_graph_reset(result->gb_opt);
 
     return result;
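With the defaults moved into the struct definitions, ggml_opt_init only assigns what actually comes from params; the FORWARD and GRAD early returns now leave every untouched member in a known state instead of relying on each branch to null things out by hand. A hedged, reduced illustration of the idiom (example_ctx is hypothetical, not from the commit):

    // Hedged sketch: in-class initializers make every construction path safe.
    struct example_ctx {
        struct ggml_cgraph *  gb_opt     = nullptr; // valid even if a branch never assigns it
        ggml_backend_buffer_t buf_static = nullptr;
    };
    example_ctx * result = new example_ctx; // no member is ever uninitialized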
ggml/src/ggml.c
CHANGED
@@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) {
 }
 
 // utility functions to change gradients
-// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
-// else if a is in zero_table, replace a
+// isrc is the index of tensor in cgraph->visited_hash_set.keys
+// the corresponding gradient (accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
 // else, just add/subtract/etc. the gradients
 
 static void ggml_add_or_set(
@@ -5028,11 +5030,14 @@ static void ggml_add_or_set(
         struct ggml_cgraph * cgraph,
         size_t isrc,
         struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
-        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = tensor;
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5040,18 +5045,20 @@ static void ggml_acc_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph * cgraph,
         size_t isrc,
-        struct ggml_tensor * src,
         struct ggml_tensor * tensor,
         const size_t nb1,
         const size_t nb2,
         const size_t nb3,
         const size_t offset) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
     } else {
         struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5059,13 +5066,15 @@ static void ggml_add1_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph * cgraph,
         size_t isrc,
-        struct ggml_tensor * src,
         struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5074,11 +5083,14 @@ static void ggml_sub_or_set(
         struct ggml_cgraph * cgraph,
         size_t isrc,
         struct ggml_tensor * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_neg(ctx, tensor);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5095,12 +5107,12 @@ static void ggml_compute_backward(
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
     struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
-    const size_t isrc0 = ggml_hash_find(hash_set, src0);
-    const size_t isrc1 = ggml_hash_find(hash_set, src1);
-    const size_t isrc2 = ggml_hash_find(hash_set, src2);
-    const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
-    const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
-    const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
 
     switch (tensor->op) {
         case GGML_OP_DUP: {
@@ -5200,7 +5212,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SUM: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
             }
         } break;
         case GGML_OP_SUM_ROWS: {
@@ -5210,7 +5222,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MEAN: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
             }
         } break;
         case GGML_OP_REPEAT: {
@@ -5363,7 +5375,7 @@ static void ggml_compute_backward(
                 nb3 = (nb3 / n0) * ng;
             }
 
-            ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
+            ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
         }
         } break;
         case GGML_OP_PERMUTE: {
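The _or_set helpers previously took the source tensor as a separate parameter alongside its hash index; nothing forced the two to agree, and a mismatched pair would silently update the wrong gradient slot. Deriving src from the index removes that failure mode by construction, and ggml_format_name gives every gradient a traceable name. A hedged sketch of the new invariant (cgraph and isrc assumed valid):

    // Hedged sketch: src is now derived from the index, so a tensor/index
    // pair cannot diverge the way the old (..., isrc, src, ...) signatures allowed.
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src); // a stale or foreign index fails loudly here
    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);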
@@ -5597,10 +5609,9 @@ void ggml_build_backward_expand(
 
     const int n_nodes_f = cgraph->n_nodes;
 
-    const size_t hash_size = ggml_hash_size(2*cgraph->size);
-    memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
-    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
-    bool * grads_needed = calloc(hash_size, sizeof(bool));
+    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
 
     {
         bool any_params = false;
@@ -5621,7 +5632,7 @@ void ggml_build_backward_expand(
             continue;
         }
 
-        bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5649,7 @@ void ggml_build_backward_expand(
             } break;
 
             // gradients in node->src[1] for one reason or another have no effect on output gradients
-            case GGML_OP_CPY:           // gradients in CPY target
+            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
             case GGML_OP_GET_ROWS:      // row indices not differentiable
             case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
             case GGML_OP_ROPE:          // positions not differentiable
@@ -5665,9 +5676,12 @@ void ggml_build_backward_expand(
             node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
 
         const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
         if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grads[igrad]     = ggml_dup_tensor(ctx_static, node);
-            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
+            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
+            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
         }
         grads_needed[igrad] = true;
     }
@@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
-        /*.size             =*/ 0,
-        /*.n_nodes          =*/ i1 - i0,
-        /*.n_leafs          =*/ 0,
-        /*.nodes            =*/ cgraph0->nodes + i0,
-        /*.grads            =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.grad_accs        =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
-        /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;
@@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         }
     }
 
+    if (dst->grads) {
+        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    }
     if (src->grads) {
         GGML_ASSERT(dst->grads != NULL);
         GGML_ASSERT(dst->grad_accs != NULL);
         for (int i = 0; i < src->n_nodes; ++i) {
             const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
             const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
             dst->grads[igrad_dst]     = src->grads[igrad_src];
             dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
         }
@@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 
         if (node->op == GGML_OP_OPT_STEP_ADAMW) {
             // clear momenta
-            if (node->src[2]->data) {
-                ggml_set_zero(node->src[2]);
-            }
-            if (node->src[3]->data) {
-                ggml_set_zero(node->src[3]);
-            }
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
         }
 
         // initial gradients of loss should be 1, 0 otherwise
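Taken together with the dup_graph and ggml_graph_cpy changes, the ggml_graph_view hunk is the center of the fix: a view that exposed the parent's gradient arrays mixed two index spaces, node order versus hash position, so writes through the view could land on unrelated slots of the parent graph. The view now carries no gradient state at all. A hedged sketch of the distinction (graph and i0 assumed valid):

    // Hedged sketch: why a view must not alias the parent's gradient arrays.
    // nodes[] is ordered by build order; grads[] is ordered by hash position.
    struct ggml_tensor * node = graph->nodes[i0];                      // index space 1
    const size_t igrad = ggml_hash_find(&graph->visited_hash_set, node);
    struct ggml_tensor * grad = graph->grads[igrad];                   // index space 2
    // Indexing graph->grads with i0 would touch slot i0 of index space 2
    // for a node that actually lives at igrad, corrupting another gradient.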
|