Commit afa1447 · Parent(s): 06acee2

CUDA: refactor and optimize IQ MMVQ (llama/8215)

* CUDA: refactor and optimize IQ MMVQ
* uint -> uint32_t
* __dp4a -> ggml_cuda_dp4a
* remove MIN_CC_DP4A checks
* change default
* try CI fix

Files changed:
- ggml/src/ggml-common.h +7 -7
- ggml/src/ggml-cuda.cu +5 -7
- ggml/src/ggml-cuda/common.cuh +51 -25
- ggml/src/ggml-cuda/fattn-common.cuh +5 -45
- ggml/src/ggml-cuda/mmvq.cu +16 -10
- ggml/src/ggml-cuda/vecdotq.cuh +319 -369
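
The central change in this commit is that call sites stop invoking `__dp4a` directly behind their own compute-capability guards and instead call a single `ggml_cuda_dp4a` wrapper (added in `common.cuh` below). The following stand-alone CUDA program is a hedged sketch of that call-site pattern, not the upstream code: `ggml_cuda_dp4a_sketch`, `dot_kernel` and `main` are illustrative names, and the HIP branches of the real wrapper are omitted.

```cuda
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

// simplified stand-in for ggml_cuda_dp4a: the arch check lives here, not at call sites
static __device__ __forceinline__ int ggml_cuda_dp4a_sketch(const int a, const int b, int c) {
#if __CUDA_ARCH__ >= 610 // value of MIN_CC_DP4A in ggml
    return __dp4a(a, b, c);                  // hardware 4x int8 dot product + accumulate
#else
    const int8_t * a8 = (const int8_t *) &a; // portable fallback: unpack the 4 int8 lanes
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif
}

static __global__ void dot_kernel(const int * a, const int * b, int * out, int n) {
    int sum = 0;
    for (int i = 0; i < n; ++i) {
        sum = ggml_cuda_dp4a_sketch(a[i], b[i], sum); // no per-call-site arch guard needed
    }
    *out = sum;
}

int main() {
    // two vectors of 8 int8 values, packed as 2 ints each
    const int8_t a8[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const int8_t b8[8] = {1, 1, 1, 1, 2, 2, 2, 2};
    int a[2], b[2];
    memcpy(a, a8, sizeof(a));
    memcpy(b, b8, sizeof(b));

    int *da, *db, *dout, out = 0;
    cudaMalloc(&da, sizeof(a)); cudaMalloc(&db, sizeof(b)); cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, a, sizeof(a), cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, sizeof(b), cudaMemcpyHostToDevice);
    dot_kernel<<<1, 1>>>(da, db, dout, 2);
    cudaMemcpy(&out, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dot = %d (expected 62)\n", out); // (1+2+3+4) + 2*(5+6+7+8) = 62
    return 0;
}
```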
ggml/src/ggml-common.h  (CHANGED)

@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2

 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS
+#define QR2_XXS 4

 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS
+#define QR2_XS 4

 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S
+#define QR2_S 4

 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS
+#define QR3_XXS 4

 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS
+#define QR3_XS 4

 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8

@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2

 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS
+#define QR4_XS 2

 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S
+#define QR3_S 4

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
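
The QI* values are derived directly from the QR* values via these macros, so changing a QR also changes the corresponding QI. A small host-side arithmetic check of the updated values, assuming `QK_K = 256` (the default super-block size in ggml-common.h):

```cuda
#include <cstdio>

#define QK_K 256
#define QR2_XXS 4
#define QI2_XXS (QK_K / (4*QR2_XXS))
#define QR4_XS 2
#define QI4_XS  (QK_K / (4*QR4_XS))

int main() {
    printf("QI2_XXS = %d\n", QI2_XXS); // 256 / (4*4) = 16
    printf("QI4_XS  = %d\n", QI4_XS);  // 256 / (4*2) = 32
    return 0;
}
```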
ggml/src/ggml-cuda.cu  (CHANGED)

@@ -1883,6 +1883,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
     bool any_gpus_with_slow_fp16 = false;

     if (split) {
@@ -1895,22 +1900,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            }

            const int cc = ggml_cuda_info().devices[id].cc;
-           use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
        }
    } else {
        const int cc = ggml_cuda_info().devices[ctx.device].cc;
-       use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
    }

-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
ggml/src/ggml-cuda/common.cuh  (CHANGED)

@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "ggml-cuda.h"

+#include <cstdint>
 #include <memory>

 #if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
     return c;
 }

-static __device__ __forceinline__ int
-#
-    int
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
     return c;
 }

@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
 }
 #endif // CUDART_VERSION < 12000

+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
 // TODO: move to ggml-common.h
-static
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
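
Besides `ggml_cuda_dp4a`, this hunk also adds a `__vcmpne4` emulation for HIP: a per-byte "not equal" compare that yields `0xff` where bytes differ and `0x00` where they match. A host-side reference of that semantics, as a checking aid only (not part of the CUDA sources):

```cuda
#include <cstdint>
#include <cstdio>
#include <cstring>

static unsigned int vcmpne4_ref(unsigned int a, unsigned int b) {
    uint8_t va[4], vb[4], vc[4];
    memcpy(va, &a, 4);
    memcpy(vb, &b, 4);
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff; // same byte-wise rule as __vcmpne4
    }
    unsigned int c;
    memcpy(&c, vc, 4);
    return c;
}

int main() {
    printf("%08x\n", vcmpne4_ref(0x11223344u, 0x11aa33ccu)); // prints 00ff00ff
    return 0;
}
```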
ggml/src/ggml-cuda/fattn-common.cuh  (CHANGED)

@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
     GGML_UNUSED(Q_v);

-
+    T sum = 0.0f;

 #pragma unroll
     for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
         const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi =
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
         const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi =
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(

         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi =
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(

         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi =
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <typename T, int D>
ggml/src/ggml-cuda/mmvq.cu  (CHANGED)

@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {

 static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
-        type == GGML_TYPE_Q4_1
-        type == GGML_TYPE_Q5_0
-        type == GGML_TYPE_Q5_1
-        type == GGML_TYPE_Q8_0
-        type == GGML_TYPE_Q2_K
-        type == GGML_TYPE_Q3_K
-        type == GGML_TYPE_Q4_K
-        type == GGML_TYPE_Q5_K
-        type == GGML_TYPE_Q6_K
-        type ==
+        type == GGML_TYPE_Q4_1    ? VDR_Q4_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_0    ? VDR_Q5_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_1    ? VDR_Q5_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q8_0    ? VDR_Q8_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q2_K    ? VDR_Q2_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q3_K    ? VDR_Q3_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q4_K    ? VDR_Q4_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_K    ? VDR_Q5_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q6_K    ? VDR_Q6_K_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XS  ? VDR_IQ2_XS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_S   ? VDR_IQ2_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_S   ? VDR_IQ3_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_NL  ? VDR_IQ4_NL_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_XS  ? VDR_IQ4_XS_Q8_1_MMVQ :
         1;
 }
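
`get_vdr_mmvq` is a constexpr chain of conditionals mapping a quantization type to its VDR (how many contiguous ints each thread processes), which lets the result feed template parameters at compile time. A hedged, self-contained sketch of that dispatch idea with illustrative enum values and VDRs (not the full ggml_type table):

```cuda
enum sketch_type { TYPE_Q4_0, TYPE_Q8_0, TYPE_IQ4_NL };

#define VDR_Q4_0_SKETCH   2
#define VDR_Q8_0_SKETCH   2
#define VDR_IQ4_NL_SKETCH 2

static constexpr int get_vdr_sketch(sketch_type type) {
    return type == TYPE_Q4_0   ? VDR_Q4_0_SKETCH   :
           type == TYPE_Q8_0   ? VDR_Q8_0_SKETCH   :
           type == TYPE_IQ4_NL ? VDR_IQ4_NL_SKETCH :
           1;
}

// the constexpr result can parameterize a kernel configuration at compile time
template <int vdr> struct ints_per_thread { static constexpr int value = vdr; };

static_assert(ints_per_thread<get_vdr_sketch(TYPE_Q4_0)>::value == 2, "vdr check");

int main() { return 0; }
```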
ggml/src/ggml-cuda/vecdotq.cuh
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
#include "common.cuh"
|
|
|
|
| 2 |
|
| 3 |
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
|
| 4 |
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
|
@@ -28,6 +29,18 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
|
|
| 28 |
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
| 29 |
}
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
| 33 |
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
|
@@ -38,7 +51,6 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
|
|
| 38 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
| 39 |
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
| 40 |
|
| 41 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 42 |
int sumi = 0;
|
| 43 |
|
| 44 |
#pragma unroll
|
|
@@ -47,17 +59,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
| 47 |
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
| 48 |
|
| 49 |
// SIMD dot product of quantized values
|
| 50 |
-
sumi =
|
| 51 |
-
sumi =
|
| 52 |
}
|
| 53 |
|
| 54 |
const float2 ds8f = __half22float2(ds8);
|
| 55 |
|
| 56 |
// second part effectively subtracts 8 from each quant value
|
| 57 |
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
| 58 |
-
#else
|
| 59 |
-
NO_DEVICE_CODE;
|
| 60 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 61 |
}
|
| 62 |
|
| 63 |
#define VDR_Q4_1_Q8_1_MMVQ 2
|
|
@@ -66,7 +75,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
| 66 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
| 67 |
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
| 68 |
|
| 69 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 70 |
int sumi = 0;
|
| 71 |
|
| 72 |
#pragma unroll
|
|
@@ -75,8 +83,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
| 75 |
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
| 76 |
|
| 77 |
// SIMD dot product of quantized values
|
| 78 |
-
sumi =
|
| 79 |
-
sumi =
|
| 80 |
}
|
| 81 |
|
| 82 |
#ifdef GGML_CUDA_F16
|
|
@@ -92,9 +100,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
| 92 |
|
| 93 |
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
| 94 |
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
| 95 |
-
#else
|
| 96 |
-
NO_DEVICE_CODE;
|
| 97 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 98 |
}
|
| 99 |
|
| 100 |
#define VDR_Q5_0_Q8_1_MMVQ 2
|
|
@@ -103,7 +108,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
| 103 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
| 104 |
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
| 105 |
|
| 106 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 107 |
int sumi = 0;
|
| 108 |
|
| 109 |
#pragma unroll
|
|
@@ -113,23 +117,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
| 113 |
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
| 114 |
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
| 115 |
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
| 116 |
-
sumi =
|
| 117 |
|
| 118 |
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
| 119 |
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
| 120 |
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
| 121 |
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
| 122 |
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
| 123 |
-
sumi =
|
| 124 |
}
|
| 125 |
|
| 126 |
const float2 ds8f = __half22float2(ds8);
|
| 127 |
|
| 128 |
// second part effectively subtracts 16 from each quant value
|
| 129 |
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
| 130 |
-
#else
|
| 131 |
-
NO_DEVICE_CODE;
|
| 132 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 133 |
}
|
| 134 |
|
| 135 |
#define VDR_Q5_1_Q8_1_MMVQ 2
|
|
@@ -138,7 +139,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
| 138 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
|
| 139 |
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
|
| 140 |
|
| 141 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 142 |
int sumi = 0;
|
| 143 |
|
| 144 |
#pragma unroll
|
|
@@ -148,14 +148,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
| 148 |
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
| 149 |
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
| 150 |
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
| 151 |
-
sumi =
|
| 152 |
|
| 153 |
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
| 154 |
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
| 155 |
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
| 156 |
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
| 157 |
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
| 158 |
-
sumi =
|
| 159 |
}
|
| 160 |
|
| 161 |
#ifdef GGML_CUDA_F16
|
|
@@ -171,10 +171,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
| 171 |
|
| 172 |
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
| 173 |
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
| 174 |
-
|
| 175 |
-
#else
|
| 176 |
-
NO_DEVICE_CODE;
|
| 177 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 178 |
}
|
| 179 |
|
| 180 |
#define VDR_Q8_0_Q8_1_MMVQ 2
|
|
@@ -183,31 +179,26 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
| 183 |
template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
|
| 184 |
const int * v, const int * u, const T & d8_0, const T & d8_1) {
|
| 185 |
|
| 186 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 187 |
int sumi = 0;
|
| 188 |
|
| 189 |
#pragma unroll
|
| 190 |
for (int i = 0; i < vdr; ++i) {
|
| 191 |
// SIMD dot product of quantized values
|
| 192 |
-
sumi =
|
| 193 |
}
|
| 194 |
|
| 195 |
return d8_0*d8_1 * ((T) sumi);
|
| 196 |
-
#else
|
| 197 |
-
NO_DEVICE_CODE;
|
| 198 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 199 |
}
|
| 200 |
|
| 201 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
|
| 202 |
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
|
| 203 |
|
| 204 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 205 |
int sumi = 0;
|
| 206 |
|
| 207 |
#pragma unroll
|
| 208 |
for (int i = 0; i < vdr; ++i) {
|
| 209 |
// SIMD dot product of quantized values
|
| 210 |
-
sumi =
|
| 211 |
}
|
| 212 |
|
| 213 |
#ifdef GGML_CUDA_F16
|
|
@@ -223,9 +214,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
| 223 |
|
| 224 |
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
| 225 |
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
| 226 |
-
#else
|
| 227 |
-
NO_DEVICE_CODE;
|
| 228 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 229 |
}
|
| 230 |
|
| 231 |
#define VDR_Q2_K_Q8_1_MMVQ 1
|
|
@@ -236,7 +224,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|
| 236 |
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
| 237 |
const half2 & dm2, const float * __restrict__ d8) {
|
| 238 |
|
| 239 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 240 |
float sumf_d = 0.0f;
|
| 241 |
float sumf_m = 0.0f;
|
| 242 |
|
|
@@ -246,28 +233,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|
| 246 |
|
| 247 |
const int vi = (v >> (2*i)) & 0x03030303;
|
| 248 |
|
| 249 |
-
sumf_d += d8[i] * (
|
| 250 |
|
| 251 |
// fill int with 4x m
|
| 252 |
int m = sc >> 4;
|
| 253 |
m |= m << 8;
|
| 254 |
m |= m << 16;
|
| 255 |
-
sumf_m += d8[i] *
|
| 256 |
}
|
| 257 |
|
| 258 |
const float2 dm2f = __half22float2(dm2);
|
| 259 |
|
| 260 |
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
| 261 |
-
#else
|
| 262 |
-
NO_DEVICE_CODE;
|
| 263 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 264 |
}
|
| 265 |
|
| 266 |
// contiguous u/y values
|
| 267 |
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
| 268 |
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
|
| 269 |
|
| 270 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 271 |
float sumf_d = 0.0f;
|
| 272 |
float sumf_m = 0.0f;
|
| 273 |
|
|
@@ -281,8 +264,8 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|
| 281 |
#pragma unroll
|
| 282 |
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
| 283 |
const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
|
| 284 |
-
sumi_d =
|
| 285 |
-
sumi_m =
|
| 286 |
}
|
| 287 |
|
| 288 |
sumf_d += dm2f.x * sumi_d;
|
|
@@ -290,9 +273,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|
| 290 |
}
|
| 291 |
|
| 292 |
return d8*(sumf_d - sumf_m);
|
| 293 |
-
#else
|
| 294 |
-
NO_DEVICE_CODE;
|
| 295 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 296 |
}
|
| 297 |
|
| 298 |
#define VDR_Q3_K_Q8_1_MMVQ 1
|
|
@@ -303,7 +283,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|
| 303 |
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
| 304 |
const int & scale_offset, const float & d3, const float * __restrict__ d8) {
|
| 305 |
|
| 306 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 307 |
float sumf = 0.0f;
|
| 308 |
|
| 309 |
#pragma unroll
|
|
@@ -326,13 +305,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|
| 326 |
|
| 327 |
const int vi = __vsubss4(vil, vih);
|
| 328 |
|
| 329 |
-
sumf += d8[i] * (
|
| 330 |
}
|
| 331 |
|
| 332 |
return d3 * sumf;
|
| 333 |
-
#else
|
| 334 |
-
NO_DEVICE_CODE;
|
| 335 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 336 |
}
|
| 337 |
|
| 338 |
// contiguous u/y values
|
|
@@ -340,7 +316,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|
| 340 |
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
| 341 |
const float & d3, const float & d8) {
|
| 342 |
|
| 343 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 344 |
int sumi = 0;
|
| 345 |
|
| 346 |
#pragma unroll
|
|
@@ -350,16 +325,13 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|
| 350 |
#pragma unroll
|
| 351 |
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
| 352 |
const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
|
| 353 |
-
sumi_sc =
|
| 354 |
}
|
| 355 |
|
| 356 |
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
| 357 |
}
|
| 358 |
|
| 359 |
return d3*d8 * sumi;
|
| 360 |
-
#else
|
| 361 |
-
NO_DEVICE_CODE;
|
| 362 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 363 |
}
|
| 364 |
|
| 365 |
#define VDR_Q4_K_Q8_1_MMVQ 2
|
|
@@ -370,7 +342,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
| 370 |
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
| 371 |
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
| 372 |
|
| 373 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 374 |
float sumf_d = 0.0f;
|
| 375 |
float sumf_m = 0.0f;
|
| 376 |
|
|
@@ -379,8 +350,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
| 379 |
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
| 380 |
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
| 381 |
|
| 382 |
-
const int dot1 =
|
| 383 |
-
const int dot2 =
|
| 384 |
|
| 385 |
sumf_d += d8[i] * (dot1 * sc[i]);
|
| 386 |
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
|
@@ -389,10 +360,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
| 389 |
const float2 dm4f = __half22float2(dm4);
|
| 390 |
|
| 391 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 392 |
-
|
| 393 |
-
#else
|
| 394 |
-
NO_DEVICE_CODE;
|
| 395 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 396 |
}
|
| 397 |
|
| 398 |
// contiguous u/y values
|
|
@@ -400,7 +367,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
| 400 |
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
| 401 |
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
| 402 |
|
| 403 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 404 |
float sumf_d = 0.0f;
|
| 405 |
float sumf_m = 0.0f;
|
| 406 |
|
|
@@ -410,7 +376,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
| 410 |
|
| 411 |
#pragma unroll
|
| 412 |
for (int j = 0; j < QI8_1; ++j) {
|
| 413 |
-
sumi_d =
|
| 414 |
}
|
| 415 |
|
| 416 |
const float2 ds8f = __half22float2(ds8[i]);
|
|
@@ -422,10 +388,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
| 422 |
const float2 dm4f = __half22float2(dm4);
|
| 423 |
|
| 424 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 425 |
-
|
| 426 |
-
#else
|
| 427 |
-
NO_DEVICE_CODE;
|
| 428 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 429 |
}
|
| 430 |
|
| 431 |
#define VDR_Q5_K_Q8_1_MMVQ 2
|
|
@@ -436,7 +398,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
| 436 |
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
| 437 |
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
| 438 |
|
| 439 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 440 |
float sumf_d = 0.0f;
|
| 441 |
float sumf_m = 0.0f;
|
| 442 |
|
|
@@ -451,8 +412,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
| 451 |
const int v0i = vl0i | vh0i;
|
| 452 |
const int v1i = vl1i | vh1i;
|
| 453 |
|
| 454 |
-
const int dot1 =
|
| 455 |
-
const int dot2 =
|
| 456 |
|
| 457 |
sumf_d += d8[i] * (dot1 * sc[i]);
|
| 458 |
sumf_m += d8[i] * (dot2 * m[i]);
|
|
@@ -462,10 +423,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
| 462 |
const float2 dm5f = __half22float2(dm5);
|
| 463 |
|
| 464 |
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
| 465 |
-
|
| 466 |
-
#else
|
| 467 |
-
NO_DEVICE_CODE;
|
| 468 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 469 |
}
|
| 470 |
|
| 471 |
// contiguous u/y values
|
|
@@ -473,7 +430,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
| 473 |
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
| 474 |
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
| 475 |
|
| 476 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 477 |
float sumf_d = 0.0f;
|
| 478 |
float sumf_m = 0.0f;
|
| 479 |
|
|
@@ -483,7 +439,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
| 483 |
|
| 484 |
#pragma unroll
|
| 485 |
for (int j = 0; j < QI8_1; ++j) {
|
| 486 |
-
sumi_d =
|
| 487 |
}
|
| 488 |
|
| 489 |
const float2 ds8f = __half22float2(ds8[i]);
|
|
@@ -495,10 +451,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
| 495 |
const float2 dm4f = __half22float2(dm4);
|
| 496 |
|
| 497 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 498 |
-
|
| 499 |
-
#else
|
| 500 |
-
NO_DEVICE_CODE;
|
| 501 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 502 |
}
|
| 503 |
|
| 504 |
#define VDR_Q6_K_Q8_1_MMVQ 1
|
|
@@ -509,7 +461,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|
| 509 |
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
| 510 |
const float & d, const float * __restrict__ d8) {
|
| 511 |
|
| 512 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 513 |
float sumf = 0.0f;
|
| 514 |
|
| 515 |
#pragma unroll
|
|
@@ -522,13 +473,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|
| 522 |
|
| 523 |
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
| 524 |
|
| 525 |
-
sumf += d8[i] * (
|
| 526 |
}
|
| 527 |
|
| 528 |
return d*sumf;
|
| 529 |
-
#else
|
| 530 |
-
NO_DEVICE_CODE;
|
| 531 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 532 |
}
|
| 533 |
|
| 534 |
// contiguous u/y values
|
|
@@ -536,7 +484,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|
| 536 |
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
| 537 |
const float & d6, const float * __restrict__ d8) {
|
| 538 |
|
| 539 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 540 |
float sumf_d = 0.0f;
|
| 541 |
|
| 542 |
#pragma unroll
|
|
@@ -545,21 +492,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|
| 545 |
|
| 546 |
#pragma unroll
|
| 547 |
for (int i = i0; i < i0 + 2; ++i) {
|
| 548 |
-
sumi_d.x =
|
| 549 |
-
sumi_d.x =
|
| 550 |
|
| 551 |
-
sumi_d.y =
|
| 552 |
-
sumi_d.y =
|
| 553 |
}
|
| 554 |
|
| 555 |
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
| 556 |
}
|
| 557 |
|
| 558 |
return d6 * sumf_d;
|
| 559 |
-
|
| 560 |
-
#else
|
| 561 |
-
NO_DEVICE_CODE;
|
| 562 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 563 |
}
|
| 564 |
|
| 565 |
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
|
@@ -572,9 +515,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
|
| 572 |
|
| 573 |
#pragma unroll
|
| 574 |
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
|
| 575 |
-
v[i] =
|
| 576 |
-
u[2*i+0] =
|
| 577 |
-
u[2*i+1] =
|
| 578 |
}
|
| 579 |
|
| 580 |
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
|
@@ -591,9 +534,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
|
| 591 |
|
| 592 |
#pragma unroll
|
| 593 |
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
|
| 594 |
-
v[i]
|
| 595 |
-
u[2*i+0] =
|
| 596 |
-
u[2*i+1] =
|
| 597 |
}
|
| 598 |
|
| 599 |
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
|
@@ -610,10 +553,10 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
|
| 610 |
|
| 611 |
#pragma unroll
|
| 612 |
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
|
| 613 |
-
vl[i] =
|
| 614 |
-
vh[i] =
|
| 615 |
-
u[2*i+0] =
|
| 616 |
-
u[2*i+1] =
|
| 617 |
}
|
| 618 |
|
| 619 |
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
|
@@ -630,10 +573,10 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
|
| 630 |
|
| 631 |
#pragma unroll
|
| 632 |
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
|
| 633 |
-
vl[i]
|
| 634 |
-
vh[i]
|
| 635 |
-
u[2*i+0] =
|
| 636 |
-
u[2*i+1] =
|
| 637 |
}
|
| 638 |
|
| 639 |
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
|
@@ -649,8 +592,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|
| 649 |
|
| 650 |
#pragma unroll
|
| 651 |
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
| 652 |
-
v[i] =
|
| 653 |
-
u[i] =
|
| 654 |
}
|
| 655 |
|
| 656 |
return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
|
|
@@ -666,13 +609,13 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|
| 666 |
|
| 667 |
const uint8_t * scales = bq2_K->scales + scale_offset;
|
| 668 |
|
| 669 |
-
const int v =
|
| 670 |
int u[QR2_K];
|
| 671 |
float d8[QR2_K];
|
| 672 |
|
| 673 |
#pragma unroll
|
| 674 |
for (int i = 0; i < QR2_K; ++ i) {
|
| 675 |
-
u[i] =
|
| 676 |
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
|
| 677 |
}
|
| 678 |
|
|
@@ -689,17 +632,17 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
| 689 |
|
| 690 |
const float d = bq3_K->d;
|
| 691 |
|
| 692 |
-
const int vl =
|
| 693 |
|
| 694 |
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
| 695 |
-
const int vh = ~
|
| 696 |
|
| 697 |
int u[QR3_K];
|
| 698 |
float d8[QR3_K];
|
| 699 |
|
| 700 |
#pragma unroll
|
| 701 |
for (int i = 0; i < QR3_K; ++i) {
|
| 702 |
-
u[i] =
|
| 703 |
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
|
| 704 |
}
|
| 705 |
|
|
@@ -807,8 +750,8 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
| 807 |
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
| 808 |
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
| 809 |
|
| 810 |
-
const int vl =
|
| 811 |
-
const int vh =
|
| 812 |
|
| 813 |
const int8_t * scales = bq6_K->scales + scale_offset;
|
| 814 |
|
|
@@ -817,335 +760,342 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
| 817 |
|
| 818 |
#pragma unroll
|
| 819 |
for (int i = 0; i < QR6_K; ++i) {
|
| 820 |
-
u[i] =
|
| 821 |
d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
|
| 822 |
}
|
| 823 |
|
| 824 |
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
| 825 |
}
|
| 826 |
|
|
|
|
|
|
|
| 827 |
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
| 828 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
|
|
|
| 829 |
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
|
| 830 |
|
| 831 |
-
|
| 832 |
-
const
|
| 833 |
-
const
|
| 834 |
-
|
| 835 |
-
const int8_t * q8 = bq8_1[ib32].qs;
|
| 836 |
-
uint32_t aux32 = q2[2] | (q2[3] << 16);
|
| 837 |
int sumi = 0;
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
const
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 846 |
}
|
| 847 |
-
|
|
|
|
|
|
|
|
|
|
| 848 |
return d * sumi;
|
| 849 |
-
#else
|
| 850 |
-
// iqs is 0...15
|
| 851 |
-
const int ib32 = iqs/2;
|
| 852 |
-
const int il = iqs%2;
|
| 853 |
-
const uint16_t * q2 = bq2->qs + 4*ib32;
|
| 854 |
-
const uint8_t * aux8 = (const uint8_t *)q2;
|
| 855 |
-
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
| 856 |
-
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
| 857 |
-
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
| 858 |
-
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
| 859 |
-
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
| 860 |
-
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
| 861 |
-
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
| 862 |
-
int sumi1 = 0, sumi2 = 0;
|
| 863 |
-
for (int j = 0; j < 8; ++j) {
|
| 864 |
-
sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
|
| 865 |
-
sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
|
| 866 |
-
}
|
| 867 |
-
return d * (sumi1 + sumi2);
|
| 868 |
-
#endif
|
| 869 |
}
|
| 870 |
|
|
|
|
|
|
|
| 871 |
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
| 872 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 873 |
-
|
| 874 |
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
|
| 875 |
|
| 876 |
-
const
|
| 877 |
-
const uint16_t * q2 =
|
| 878 |
-
const
|
| 879 |
-
const
|
| 880 |
-
|
|
|
|
| 881 |
int sumi1 = 0;
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
const uint32_t *
|
| 885 |
-
const
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
}
|
| 901 |
-
const
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
GGML_UNUSED(ksigns64);
|
| 905 |
-
NO_DEVICE_CODE;
|
| 906 |
-
#endif
|
| 907 |
}
|
| 908 |
|
| 909 |
-
|
|
|
|
| 910 |
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
| 911 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 912 |
-
|
| 913 |
const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
|
| 914 |
|
| 915 |
-
const int
|
| 916 |
-
const
|
| 917 |
-
|
| 918 |
-
const
|
| 919 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
int sumi1 = 0;
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
const
|
| 924 |
-
|
| 925 |
-
const int
|
| 926 |
-
const int
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
| 941 |
}
|
| 942 |
-
const
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
NO_DEVICE_CODE;
|
| 947 |
-
#endif
|
| 948 |
}
|
| 949 |
|
|
|
|
|
|
|
| 950 |
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
| 951 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 952 |
-
|
| 953 |
-
const block_iq3_xxs *
|
| 954 |
-
|
| 955 |
-
const
|
| 956 |
-
const uint8_t
|
| 957 |
-
const
|
| 958 |
-
|
| 959 |
-
uint32_t aux32 = gas[0] | (gas[1] << 16);
|
| 960 |
int sumi = 0;
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
const
|
| 964 |
-
|
| 965 |
-
const int
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
}
|
| 972 |
-
|
|
|
|
|
|
|
|
|
|
| 973 |
return d * sumi;
|
| 974 |
-
#else
|
| 975 |
-
NO_DEVICE_CODE;
|
| 976 |
-
#endif
|
| 977 |
}
|
| 978 |
|
|
|
|
|
|
|
| 979 |
// TODO: don't use lookup table for signs
|
| 980 |
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
| 981 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 982 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 983 |
-
const block_iq3_s * bq2 = (const block_iq3_s *) vbq + kbx;
|
| 984 |
|
| 985 |
-
const
|
| 986 |
-
|
| 987 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 988 |
int sumi = 0;
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
const
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
const int
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 999 |
}
|
| 1000 |
-
|
|
|
|
|
|
|
|
|
|
| 1001 |
return d * sumi;
|
| 1002 |
-
#else
|
| 1003 |
-
NO_DEVICE_CODE;
|
| 1004 |
-
#endif
|
| 1005 |
}
|
| 1006 |
|
| 1007 |
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
| 1008 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 1009 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
|
| 1010 |
|
| 1011 |
-
const int
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1012 |
int sumi = 0;
|
| 1013 |
-
#
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
int grid0 = grid
|
| 1018 |
-
int grid1 = (grid
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
for (int j = 0; j < 4; ++j) {
|
| 1026 |
-
sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
|
| 1027 |
-
}
|
| 1028 |
-
q8 += 8;
|
| 1029 |
}
|
| 1030 |
-
|
| 1031 |
-
const float
|
| 1032 |
-
const float
|
| 1033 |
-
const
|
| 1034 |
-
|
| 1035 |
-
return d * sumi + m * delta;
|
| 1036 |
}
|
| 1037 |
|
| 1038 |
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
| 1039 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
|
|
|
| 1040 |
const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
|
| 1041 |
|
| 1042 |
-
const int
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
int
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
const int
|
| 1054 |
-
|
| 1055 |
-
|
| 1056 |
-
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
|
|
|
|
|
|
|
|
|
| 1060 |
int sumy = 0;
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
}
|
| 1065 |
-
const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
|
| 1066 |
-
sumf[l/2] += delta*sumy;
|
| 1067 |
-
q8 += 8;
|
| 1068 |
}
|
| 1069 |
-
|
|
|
|
|
|
|
| 1070 |
iq1m_scale_t scale;
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
|
|
|
|
|
|
|
|
|
| 1075 |
}
|
| 1076 |
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
val2 = v1 | (v2 << 16);
|
| 1090 |
}
|
| 1091 |
-
|
|
|
|
| 1092 |
|
| 1093 |
static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
| 1094 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 1095 |
|
| 1096 |
-
const block_iq4_nl *
|
| 1097 |
|
| 1098 |
-
|
| 1099 |
-
const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
|
| 1100 |
-
const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
|
| 1101 |
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
int v1, v2;
|
| 1105 |
-
int sumi1 = 0, sumi2 = 0;
|
| 1106 |
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
| 1107 |
-
const
|
| 1108 |
-
|
| 1109 |
-
sumi1 = __dp4a(v1, q8[l+0], sumi1);
|
| 1110 |
-
sumi2 = __dp4a(v2, q8[l+4], sumi2);
|
| 1111 |
-
}
|
| 1112 |
-
|
| 1113 |
-
#else
|
| 1114 |
-
const uint8_t * q4 = bq->qs + 4*iqs;
|
| 1115 |
-
const int8_t * q8 = bq8_1->qs + 4*iqs;
|
| 1116 |
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
|
| 1120 |
-
sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
|
| 1121 |
}
|
| 1122 |
-
|
| 1123 |
-
const float d = (
|
| 1124 |
-
return d *
|
| 1125 |
}
|
| 1126 |
|
|
|
|
|
|
|
| 1127 |
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
| 1128 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
| 1129 |
|
| 1130 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 1131 |
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
const int ib32 = iqs;
|
| 1136 |
-
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
| 1137 |
-
const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
|
| 1138 |
-
const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
|
| 1139 |
-
const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
|
| 1140 |
-
int v1, v2;
|
| 1141 |
-
int sumi1 = 0, sumi2 = 0;
|
| 1142 |
for (int j = 0; j < 4; ++j) {
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1146 |
}
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
|
|
|
|
|
|
| 1151 |
}
|
|
|
|
| 1 |
#include "common.cuh"
|
| 2 |
+
#include <cstdint>
|
| 3 |
|
| 4 |
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
|
| 5 |
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
|
|
|
| 29 |
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
| 30 |
}
|
| 31 |
|
| 32 |
+
static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
|
| 33 |
+
const uint16_t * x16 = (const uint16_t *) x;
|
| 34 |
+
|
| 35 |
+
int x32 = x16[2*i32 + 0] << 0;
|
| 36 |
+
x32 |= x16[2*i32 + 1] << 16;
|
| 37 |
+
|
| 38 |
+
return x32;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
|
| 42 |
+
return ((const int *) x)[i32]; // assume at least 4 byte alignment
|
| 43 |
+
}
|
| 44 |
|
| 45 |
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
| 46 |
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
|
|
|
| 51 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
| 52 |
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
| 53 |
|
|
|
|
| 54 |
int sumi = 0;
|
| 55 |
|
| 56 |
#pragma unroll
|
|
|
|
| 59 |
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
| 60 |
|
| 61 |
// SIMD dot product of quantized values
|
| 62 |
+
sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
|
| 63 |
+
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
|
| 64 |
}
|
| 65 |
|
| 66 |
const float2 ds8f = __half22float2(ds8);
|
| 67 |
|
| 68 |
// second part effectively subtracts 8 from each quant value
|
| 69 |
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
|
|
|
|
|
|
|
|
|
| 70 |
}
|
| 71 |
|
| 72 |
#define VDR_Q4_1_Q8_1_MMVQ 2
|
|
|
|
| 75 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
| 76 |
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
| 77 |
|
|
|
|
| 78 |
int sumi = 0;
|
| 79 |
|
| 80 |
#pragma unroll
|
|
|
|
| 83 |
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
| 84 |
|
| 85 |
// SIMD dot product of quantized values
|
| 86 |
+
sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
|
| 87 |
+
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
|
| 88 |
}
|
| 89 |
|
| 90 |
#ifdef GGML_CUDA_F16
|
|
|
|
| 100 |
|
| 101 |
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
| 102 |
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
|
|
|
|
|
|
|
|
|
| 103 |
}
|
| 104 |
|
| 105 |
#define VDR_Q5_0_Q8_1_MMVQ 2
|
|
|
|
| 108 |
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
| 109 |
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
| 110 |
|
|
|
|
| 111 |
int sumi = 0;
|
| 112 |
|
| 113 |
#pragma unroll
|
|
|
|
| 117 |
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
| 118 |
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
| 119 |
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
| 120 |
+
sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
| 121 |
|
| 122 |
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
| 123 |
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
| 124 |
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
| 125 |
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
| 126 |
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
| 127 |
+
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
| 128 |
}
|
| 129 |
|
| 130 |
const float2 ds8f = __half22float2(ds8);
|
| 131 |
|
| 132 |
// second part effectively subtracts 16 from each quant value
|
| 133 |
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
|
|
|
|
|
|
|
|
|
| 134 |
}

#define VDR_Q5_1_Q8_1_MMVQ 2

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {

    int sumi = 0;

#pragma unroll
        vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
        vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
        vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
        vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

#ifdef GGML_CUDA_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
}

#define VDR_Q8_0_Q8_1_MMVQ 2

template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
    const int * v, const int * u, const T & d8_0, const T & d8_1) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
    }

    return d8_0*d8_1 * ((T) sumi);
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
    }

#ifdef GGML_CUDA_F16

    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
}

#define VDR_Q2_K_Q8_1_MMVQ 1

    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float * __restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

        const int vi = (v >> (2*i)) & 0x03030303;

+        sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // fill int with 4x m
        int m = sc >> 4;
        m |= m << 8;
        m |= m << 16;
+        sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }

    const float2 dm2f = __half22float2(dm2);

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
}
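
A standalone sketch (not part of this diff) of the "fill int with 4x m" trick above: broadcasting a small non-negative value m into all four bytes lets a single dp4a produce m times the sum of the four q8_1 bytes in u[i].

#include <cstdint>

// Broadcast a value in 0..127 into all four bytes, so that
// dp4a(broadcast(m), u, 0) == m*(u0 + u1 + u2 + u3) over the int8 lanes of u.
static uint32_t broadcast_byte_reference(uint32_t m) {
    m |= m << 8;  // bytes 0 and 1
    m |= m << 16; // all four bytes
    return m;
}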

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
+            sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
+            sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
        }

        sumf_d += dm2f.x * sumi_d;
    }

    return d8*(sumf_d - sumf_m);
}

#define VDR_Q3_K_Q8_1_MMVQ 1

    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d3, const float * __restrict__ d8) {

    float sumf = 0.0f;

#pragma unroll

        const int vi = __vsubss4(vil, vih);

+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d3 * sumf;
}

// contiguous u/y values
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d3, const float & d8) {

    int sumi = 0;

#pragma unroll

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
+            sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
        }

        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
    }

    return d3*d8 * sumi;
}

#define VDR_Q4_K_Q8_1_MMVQ 2

    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

+        const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

// contiguous u/y values
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

#define VDR_Q5_K_Q8_1_MMVQ 2

    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

+        const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);

    const float2 dm5f = __half22float2(dm5);

    return dm5f.x*sumf_d - dm5f.y*sumf_m;
}

// contiguous u/y values
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

#define VDR_Q6_K_Q8_1_MMVQ 1

    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {

    float sumf = 0.0f;

#pragma unroll

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
}

// contiguous u/y values
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
    const float & d6, const float * __restrict__ d8) {

    float sumf_d = 0.0f;

#pragma unroll

#pragma unroll
        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product

+            sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
        }

        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
    }

    return d6 * sumf_d;
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(

#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_b2(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);

#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_b4(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1);
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);

#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i] = get_int_b2(bq5_0->qs, iqs + i);
+        vh[i] = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0);
    }

    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);

#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i] = get_int_b4(bq5_1->qs, iqs + i);
+        vh[i] = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1);
    }

    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);

#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_b2(bq8_0->qs, iqs + i);
+        u[i] = get_int_b4(bq8_1->qs, iqs + i);
    }

    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));

    const uint8_t * scales = bq2_K->scales + scale_offset;

+    const int v = get_int_b4(bq2_K->qs, iqs);
    int u[QR2_K];
    float d8[QR2_K];

#pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
+        u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    const float d = bq3_K->d;

+    const int vl = get_int_b2(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int u[QR3_K];
    float d8[QR3_K];

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
+        u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

+    const int vl = get_int_b2(bq6_K->ql, iqs);
+    const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
+        u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}

+#define VDR_IQ2_XXS_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;

+    const int q2 = get_int_b2(bq2->qs, iqs);
+    const uint8_t * aux8 = (const uint8_t *) &q2;
+    const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1);
+
    int sumi = 0;
+#pragma unroll
+    for (int k0 = 0; k0 < 8; k0 += 2) {
+        const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
+        const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
+
+        const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
+        const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+
+        const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
+        const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
    }
+
+    const int ls = aux32 >> 28;
+    sumi = (ls*sumi + sumi/2)/4;
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}
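
A standalone sketch (not part of this diff) of the per-byte sign trick the iq2/iq3 kernels above use: __vcmpne4 expands each selected sign bit into a 0xFF byte mask, and (v ^ mask) - mask then negates exactly the masked bytes in two's complement, leaving the others untouched. The helper below shows the same idea for a single lane.

#include <cstdint>

// Scalar reference: with mask = 0xFF, (v ^ 0xFF) - 0xFF == ~v + 1 == -v;
// with mask = 0x00 the value passes through unchanged. __vcmpne4/__vsub4
// apply this to four bytes at once.
static int8_t apply_sign_reference(int8_t v, bool negate) {
    const uint8_t mask = negate ? 0xFF : 0x00;
    return (int8_t) ((((uint8_t) v) ^ mask) - mask);
}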

+#define VDR_IQ2_XS_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;

+    const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
+    const uint16_t * q2 = (const uint16_t *) &q2_packed;
+    const int ls0 = bq2->scales[iqs/2] & 0x0F;
+    const int ls1 = bq2->scales[iqs/2] >> 4;
+
+    int sumi0 = 0;
    int sumi1 = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
+
+        const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        if (l0 < 4) {
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+        } else {
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+        }
    }
+    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
}
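
A short standalone note (not part of this diff) on the integer scaling above: in real arithmetic, (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4 equals sumi0*(ls0 + 0.5)/4 + sumi1*(ls1 + 0.5)/4, i.e. each half gets an effective scale of (ls + 0.5)/4 before the block-level d is applied; the kernel simply keeps the computation in integers as long as possible.

// Scalar cross-check of the scale combination (exact in floats, integer-truncated in the kernel).
static float iq2_combine_scales_reference(int sumi0, int sumi1, int ls0, int ls1) {
    return sumi0*(ls0 + 0.5f)/4.0f + sumi1*(ls1 + 0.5f)/4.0f;
}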

+#define VDR_IQ2_S_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;

+    const int qs_packed = get_int_b2(bq2->qs, iqs/2);
+    const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+    const int qh = bq2->qh[iqs/2];
+
+    const int signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2);
+    const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
+
+    const int ls0 = bq2->scales[iqs/2] & 0x0F;
+    const int ls1 = bq2->scales[iqs/2] >> 4;
+
+    int sumi0 = 0;
    int sumi1 = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300)));
+
+        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+        const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        if (l0 < 4) {
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+        } else {
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+        }
    }
+    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
}

+#define VDR_IQ3_XXS_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;
+
+    const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
+    const uint8_t * q3 = (const uint8_t *) &q3_packed;
+    const uint32_t aux32 = get_int_b2(bq3->qs, QK_K/16 + iqs/2);
+
    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
+
+        const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
+
+        const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
    }
+
+    const int ls = aux32 >> 28;
+    sumi = (ls*sumi + sumi/2)/2;
+    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

+#define VDR_IQ3_S_Q8_1_MMVQ 2
+
// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

+    const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;
+
+    const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
+    const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+    const int qh = bq3->qh[iqs/2];
+
+    const int signs_packed_32 = get_int_b2(bq3->signs, iqs/2);
+    const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
+
    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int2 grid_pos = make_int2(
+            iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)],
+            iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]);
+
+        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
    }
+
+    sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
+
+    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;

+    const int qs_packed = get_int_b2(bq1->qs, iqs);
+    const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+    const int qh = bq1->qh[iqs];
+
    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
    }
+
+    const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
+    const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+    const float2 ds = __half22float2(bq8_1[iqs].ds);
+    return d1q * (ds.x*sumi + ds.y*delta);
}

static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
    const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;

+    const int qs_packed = get_int_b4(bq1->qs, iqs);
+    const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+    int sumi[2] = {0};
+    float sumf[2] = {0.0f};
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
+
+        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
+        sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);
+
+        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
        int sumy = 0;
+        sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
+        sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
+        sumf[l0/4] += delta*sumy;
    }
+
+    const uint16_t * sc = (const uint16_t *) bq1->scales;
+
    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
+    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+
+    const int tmp = sc[iqs/2] >> (6*(iqs%2));
+    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
+    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
+    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
}

+static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
+    const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
+    const int8_t * q0_8 = (const int8_t *) &q0_32;
+    const char4 val0_8 = make_char4(
+        kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]);
+
+    const int q1_32 = (q4 >> 4) & 0x0F0F0F0F;
+    const int8_t * q1_8 = (const int8_t *) &q1_32;
+    const char4 val1_8 = make_char4(
+        kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]);
+
+    return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
}
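
A standalone sketch (not part of this diff) of what get_int_from_table_16 computes: each of the eight nibbles of q4 indexes a 16-entry int8 codebook (kvalues_iq4nl in the kernels); the low nibbles produce one packed 4-byte word, the high nibbles the other. The helper name and the table parameter below are illustrative.

#include <cstdint>

// Scalar reference: lo[j] is the codebook value for the low nibble of byte j,
// hi[j] the value for its high nibble.
static void nibble_lookup_reference(uint32_t q4, const int8_t table[16],
                                    int8_t lo[4], int8_t hi[4]) {
    for (int j = 0; j < 4; ++j) {
        lo[j] = table[(q4 >> (8*j    )) & 0x0F];
        hi[j] = table[(q4 >> (8*j + 4)) & 0x0F];
    }
}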
+
+#define VDR_IQ4_NL_Q8_1_MMVQ 2

static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

+    const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;

+    const int * q8 = (const int *) bq8_1->qs + iqs;

+    int sumi = 0;
+#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
+        const int2 v = get_int_from_table_16(aux_q4);

+        sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
+        sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
    }
+
+    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
+    return d * sumi;
}

+#define VDR_IQ4_XS_Q8_1_MMVQ 4
+
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
+
+    int sumi = 0;
+#pragma unroll
    for (int j = 0; j < 4; ++j) {
+        const int aux_q4 = get_int_b4(bq4->qs, iqs + j);
+        const int2 v = get_int_from_table_16(aux_q4);
+
+        const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);
+
+        sumi = ggml_cuda_dp4a(v.x, u0, sumi);
+        sumi = ggml_cuda_dp4a(v.y, u1, sumi);
    }
+
+    const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
+    sumi *= ls - 32;
+
+    const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
+    return d * sumi;
}
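
A standalone sketch (not part of this diff) of how the 6-bit iq4_xs sub-block scale above is assembled, written in terms of the sub-block index sb = iqs/4: four low bits come from a packed nibble of scales_l, two high bits from the packed 2-bit field of scales_h, and the result is biased by -32. The helper name is hypothetical.

#include <cstdint>

// Scalar reference for the iq4_xs sub-block scale.
static int iq4_xs_scale_reference(const uint8_t * scales_l, uint16_t scales_h, int sb) {
    const int lo = (scales_l[sb/2] >> (4*(sb%2))) & 0x0F;
    const int hi = (scales_h >> (2*sb)) & 0x03;
    return (lo | (hi << 4)) - 32;
}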