ggerganov committed on
Commit
295521c
·
1 Parent(s): af0525c

metal : simplify f16 and f32 dequant kernels (llama/0)

Browse files
Files changed (1) hide show
  1. ggml/src/ggml-metal.metal +2 -8
ggml/src/ggml-metal.metal CHANGED
@@ -19,18 +19,12 @@ constexpr constant static float kvalues_iq4nl_f[16] = {
19
  // NOTE: this is not dequantizing - we are simply fitting the template
20
  template <typename type4x4>
21
  void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
22
- float4x4 temp = *(((device float4x4 *)src));
23
- for (int i = 0; i < 16; i++){
24
- reg[i/4][i%4] = temp[i/4][i%4];
25
- }
26
  }
27
 
28
  template <typename type4x4>
29
  void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
30
- half4x4 temp = *(((device half4x4 *)src));
31
- for (int i = 0; i < 16; i++){
32
- reg[i/4][i%4] = temp[i/4][i%4];
33
- }
34
  }
35
 
36
  template <typename type4x4>
 
19
  // NOTE: this is not dequantizing - we are simply fitting the template
20
  template <typename type4x4>
21
  void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
22
+ reg = (type4x4)(*src);
 
 
 
23
  }
24
 
25
  template <typename type4x4>
26
  void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
27
+ reg = (type4x4)(*src);
 
 
 
28
  }
29
 
30
  template <typename type4x4>