Spaces:
Sleeping
Sleeping
snadampal
committed on
ggml : update softmax n_task calculation (llama/5126)
Browse files
Updated the n_task calculation to use the maximum number of
threads possible. This has improved the prompt eval
performance by around 5% for DOT kernels and by
around 10% for MMLA kernels on AWS Graviton3.
ggml.c
CHANGED
|
@@ -16602,7 +16602,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
| 16602 |
} break;
|
| 16603 |
case GGML_OP_SOFT_MAX:
|
| 16604 |
{
|
| 16605 |
-
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
| 16606 |
} break;
|
| 16607 |
case GGML_OP_CONV_TRANSPOSE_1D:
|
| 16608 |
{
|
|
|
|
| 16602 |
} break;
|
| 16603 |
case GGML_OP_SOFT_MAX:
|
| 16604 |
{
|
| 16605 |
+
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
|
| 16606 |
} break;
|
| 16607 |
case GGML_OP_CONV_TRANSPOSE_1D:
|
| 16608 |
{
|