smeso committed on
Commit
b2602d7
·
1 Parent(s): ad34655

vulkan : argsort barriers must be under uniform control flow (ggml/951)

Browse files

A return before a barrier (one that is taken by only some threads in
a workgroup) leads to undefined behavior (UB).
While the old code actually works on some devices,
it fails on others (i.e. "smaller" GPUs).

BTW, I think it would be better to set the specialization constants
when the graph is built; that way the local workgroup
could be sized appropriately.
However, that would take a lot of work.

Signed-off-by: Salvatore Mesoraca <[email protected]>

ggml/src/vulkan-shaders/argsort.comp CHANGED
@@ -29,20 +29,18 @@ void main() {
29
  const int col = int(gl_LocalInvocationID.x);
30
  const uint row = gl_WorkGroupID.y;
31
 
32
- if (col >= p.ncols_pad) {
33
- return;
34
- }
35
-
36
  const uint row_offset = row * p.ncols;
37
 
38
  // initialize indices
39
- dst_row[col] = col;
 
 
40
  barrier();
41
 
42
  for (uint k = 2; k <= p.ncols_pad; k *= 2) {
43
  for (uint j = k / 2; j > 0; j /= 2) {
44
  const uint ixj = col ^ j;
45
- if (ixj > col) {
46
  if ((col & k) == 0) {
47
  if (dst_row[col] >= p.ncols ||
48
  (dst_row[ixj] < p.ncols && (p.order == ASC ?
 
29
  const int col = int(gl_LocalInvocationID.x);
30
  const uint row = gl_WorkGroupID.y;
31
 
 
 
 
 
32
  const uint row_offset = row * p.ncols;
33
 
34
  // initialize indices
35
+ if (col < p.ncols_pad) {
36
+ dst_row[col] = col;
37
+ }
38
  barrier();
39
 
40
  for (uint k = 2; k <= p.ncols_pad; k *= 2) {
41
  for (uint j = k / 2; j > 0; j /= 2) {
42
  const uint ixj = col ^ j;
43
+ if (col < p.ncols_pad && ixj > col) {
44
  if ((col & k) == 0) {
45
  if (dst_row[col] >= p.ncols ||
46
  (dst_row[ixj] < p.ncols && (p.order == ASC ?