Spaces:
Running
Running
metal : simplify f16 and f32 dequant kernels (llama/0)
Browse files
ggml/src/ggml-metal.metal
CHANGED
|
@@ -19,18 +19,12 @@ constexpr constant static float kvalues_iq4nl_f[16] = {
|
|
| 19 |
// NOTE: this is not dequantizing - we are simply fitting the template
|
| 20 |
template <typename type4x4>
|
| 21 |
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
| 22 |
-
float4x4 temp = *(((device float4x4 *)src));
|
| 23 |
-
for (int i = 0; i < 16; i++){
|
| 24 |
-
reg[i/4][i%4] = temp[i/4][i%4];
|
| 25 |
-
}
|
| 26 |
}
|
| 27 |
|
| 28 |
template <typename type4x4>
|
| 29 |
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
| 30 |
-
half4x4 temp = *(((device half4x4 *)src));
|
| 31 |
-
for (int i = 0; i < 16; i++){
|
| 32 |
-
reg[i/4][i%4] = temp[i/4][i%4];
|
| 33 |
-
}
|
| 34 |
}
|
| 35 |
|
| 36 |
template <typename type4x4>
|
|
|
|
| 19 |
// NOTE: this is not dequantizing - we are simply fitting the template
|
| 20 |
template <typename type4x4>
|
| 21 |
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
| 22 |
+
reg = (type4x4)(*src);
|
|
|
|
|
|
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
template <typename type4x4>
|
| 26 |
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
| 27 |
+
reg = (type4x4)(*src);
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
|
| 30 |
template <typename type4x4>
|