jeffbolznv committed on
Commit
c31abdb
·
1 Parent(s): 50a2978

vulkan: copy iq4_nl LUT into shared memory (llama/10409)

Browse files
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp CHANGED
@@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
10
  void main() {
11
  const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
12
 
 
 
13
  const uint tid = gl_LocalInvocationID.x % 64;
14
  const uint il = tid/32;
15
  const uint ir = tid%32;
 
10
  void main() {
11
  const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
12
 
13
+ init_iq4nl_shmem();
14
+
15
  const uint tid = gl_LocalInvocationID.x % 64;
16
  const uint il = tid/32;
17
  const uint ir = tid%32;
ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp CHANGED
@@ -12,6 +12,10 @@ void main() {
12
  const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
13
  const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
14
 
 
 
 
 
15
  if (i00 >= p.ne00) {
16
  return;
17
  }
 
12
  const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
13
  const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
14
 
15
+ #if defined(DATA_A_IQ4_NL)
16
+ init_iq4nl_shmem();
17
+ #endif
18
+
19
  if (i00 >= p.ne00) {
20
  return;
21
  }
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp CHANGED
@@ -161,6 +161,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
161
  void main() {
162
  const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
163
 
 
 
 
 
164
  // do NUM_ROWS at a time, unless there aren't enough remaining rows
165
  if (first_row + NUM_ROWS <= p.stride_d) {
166
  compute_outputs(first_row, NUM_ROWS);
 
161
  void main() {
162
  const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
163
 
164
+ #if defined(DATA_A_IQ4_NL)
165
+ init_iq4nl_shmem();
166
+ #endif
167
+
168
  // do NUM_ROWS at a time, unless there aren't enough remaining rows
169
  if (first_row + NUM_ROWS <= p.stride_d) {
170
  compute_outputs(first_row, NUM_ROWS);
ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp CHANGED
@@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072];
75
  #endif
76
 
77
  void main() {
 
 
 
 
78
  #ifdef MUL_MAT_ID
79
  const uint expert_idx = gl_GlobalInvocationID.z;
80
  #else
 
75
  #endif
76
 
77
  void main() {
78
+ #if defined(DATA_A_IQ4_NL)
79
+ init_iq4nl_shmem();
80
+ #endif
81
+
82
  #ifdef MUL_MAT_ID
83
  const uint expert_idx = gl_GlobalInvocationID.z;
84
  #else
ggml/src/ggml-vulkan/vulkan-shaders/types.comp CHANGED
@@ -298,10 +298,21 @@ struct block_iq4_nl_packed16
298
  #define A_TYPE block_iq4_nl
299
  #define A_TYPE_PACKED16 block_iq4_nl_packed16
300
 
301
- const int8_t kvalues_iq4nl[16] = {
302
  int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
303
  int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
304
  };
 
 
 
 
 
 
 
 
 
 
 
305
  #endif
306
 
307
  #endif // !defined(GGML_TYPES_COMP)
 
298
  #define A_TYPE block_iq4_nl
299
  #define A_TYPE_PACKED16 block_iq4_nl_packed16
300
 
301
+ const int8_t kvalues_iq4nl_const[16] = {
302
  int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
303
  int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
304
  };
305
+
306
+ shared FLOAT_TYPE kvalues_iq4nl[16];
307
+
308
+ void init_iq4nl_shmem()
309
+ {
310
+ // copy the table into shared memory and sync
311
+ if (gl_LocalInvocationIndex.x < 16) {
312
+ kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
313
+ }
314
+ barrier();
315
+ }
316
  #endif
317
 
318
  #endif // !defined(GGML_TYPES_COMP)
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp CHANGED
@@ -331,11 +331,11 @@ void process_shaders() {
331
  shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
332
 
333
  if (tname == "f16") {
334
- string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
335
  } else {
336
- string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
337
  }
338
- string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
339
  }
340
  }
341
 
 
331
  shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
332
 
333
  if (tname == "f16") {
334
+ string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
335
  } else {
336
+ string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
337
  }
338
+ string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
339
  }
340
  }
341