Spaces:
Running
Running
Commit
·
bd93c1b
1
Parent(s):
d4f6b2c
vulkan: implement initial support for IQ2 and IQ3 quantizations (llama/11360)
Browse files
* vulkan: initial support for IQ3_S
* vulkan: initial support for IQ3_XXS
* vulkan: initial support for IQ2_XXS
* vulkan: initial support for IQ2_XS
* vulkan: optimize Q3_K by removing branches
* vulkan: implement dequantize variants for coopmat2
* vulkan: initial support for IQ2_S
* vulkan: vertically realign code
* port failing dequant callbacks from mul_mm
* Fix array length mismatches
* vulkan: avoid using workgroup size before it is referenced
* tests: increase timeout for Vulkan llvmpipe backend
---------
Co-authored-by: Jeff Bolz <[email protected]>
- ggml/src/ggml-vulkan/ggml-vulkan.cpp +141 -16
- ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +217 -1
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +164 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +116 -6
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +2 -2
- ggml/src/ggml-vulkan/vulkan-shaders/types.comp +735 -3
- ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +5 -0
ggml/src/ggml-vulkan/ggml-vulkan.cpp
CHANGED
|
@@ -1616,6 +1616,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1616 |
//CREATE_FA(GGML_TYPE_Q4_K, q4_k)
|
| 1617 |
//CREATE_FA(GGML_TYPE_Q5_K, q5_k)
|
| 1618 |
//CREATE_FA(GGML_TYPE_Q6_K, q6_k)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1619 |
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
|
| 1620 |
#undef CREATE_FA
|
| 1621 |
|
|
@@ -1644,7 +1649,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1644 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1645 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1646 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1647 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1648 |
|
| 1649 |
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
| 1650 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
|
@@ -1657,7 +1667,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1657 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1658 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1659 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1660 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_id[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1661 |
#undef CREATE_MM
|
| 1662 |
#undef CREATE_MM2
|
| 1663 |
} else
|
|
@@ -1705,7 +1720,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1705 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1706 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1707 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1708 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1709 |
} else {
|
| 1710 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1711 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
@@ -1718,7 +1738,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1718 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1719 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1720 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1721 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1722 |
}
|
| 1723 |
|
| 1724 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
@@ -1739,7 +1764,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1739 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1740 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1741 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1742 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_id[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1743 |
} else {
|
| 1744 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1745 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
@@ -1752,7 +1782,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1752 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1753 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1754 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1755 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_id[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1756 |
}
|
| 1757 |
}
|
| 1758 |
#undef CREATE_MM2
|
|
@@ -1796,7 +1831,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1796 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1797 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1798 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1799 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1800 |
|
| 1801 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
| 1802 |
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
|
@@ -1815,7 +1855,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1815 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1816 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1817 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1818 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_id[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1819 |
}
|
| 1820 |
#undef CREATE_MM2
|
| 1821 |
#undef CREATE_MM
|
|
@@ -1851,7 +1896,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1851 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1852 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1853 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1854 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1855 |
|
| 1856 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
| 1857 |
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
|
@@ -1870,7 +1920,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1870 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1871 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1872 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1873 |
-
CREATE_MM(pipeline_dequant_mul_mat_mat_id[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1874 |
}
|
| 1875 |
#undef CREATE_MM
|
| 1876 |
}
|
|
@@ -1901,7 +1956,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1901 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1902 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1903 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1904 |
-
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1905 |
|
| 1906 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
| 1907 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
|
@@ -1915,7 +1975,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1915 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1916 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1917 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1918 |
-
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1919 |
}
|
| 1920 |
|
| 1921 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
|
@@ -1930,7 +1995,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1930 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1931 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1932 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1933 |
-
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1934 |
|
| 1935 |
// dequant shaders
|
| 1936 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
|
@@ -1944,7 +2014,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1944 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 1945 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1946 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1947 |
-
ggml_vk_create_pipeline(device, device->pipeline_dequant[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1948 |
|
| 1949 |
// get_rows
|
| 1950 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
@@ -1954,7 +2029,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1954 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1955 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1956 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1957 |
-
ggml_vk_create_pipeline(device, device->pipeline_get_rows[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1958 |
|
| 1959 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
| 1960 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
@@ -1963,7 +2043,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1963 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1964 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1965 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1966 |
-
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1967 |
|
| 1968 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
| 1969 |
|
|
@@ -2890,6 +2975,11 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
|
| 2890 |
case GGML_TYPE_Q4_K:
|
| 2891 |
case GGML_TYPE_Q5_K:
|
| 2892 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2893 |
case GGML_TYPE_IQ4_NL:
|
| 2894 |
break;
|
| 2895 |
default:
|
|
@@ -2938,6 +3028,11 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
| 2938 |
case GGML_TYPE_Q4_K:
|
| 2939 |
case GGML_TYPE_Q5_K:
|
| 2940 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2941 |
case GGML_TYPE_IQ4_NL:
|
| 2942 |
break;
|
| 2943 |
default:
|
|
@@ -2969,6 +3064,11 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
|
| 2969 |
case GGML_TYPE_Q4_K:
|
| 2970 |
case GGML_TYPE_Q5_K:
|
| 2971 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2972 |
case GGML_TYPE_IQ4_NL:
|
| 2973 |
break;
|
| 2974 |
default:
|
|
@@ -3012,6 +3112,11 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
|
| 3012 |
case GGML_TYPE_Q4_K:
|
| 3013 |
case GGML_TYPE_Q5_K:
|
| 3014 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3015 |
case GGML_TYPE_IQ4_NL:
|
| 3016 |
break;
|
| 3017 |
default:
|
|
@@ -3038,6 +3143,11 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
|
|
| 3038 |
case GGML_TYPE_Q4_K:
|
| 3039 |
case GGML_TYPE_Q5_K:
|
| 3040 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3041 |
case GGML_TYPE_IQ4_NL:
|
| 3042 |
break;
|
| 3043 |
default:
|
|
@@ -7907,6 +8017,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 7907 |
case GGML_TYPE_Q4_K:
|
| 7908 |
case GGML_TYPE_Q5_K:
|
| 7909 |
case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7910 |
case GGML_TYPE_IQ4_NL:
|
| 7911 |
break;
|
| 7912 |
default:
|
|
@@ -7975,6 +8090,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 7975 |
//case GGML_TYPE_Q4_K:
|
| 7976 |
//case GGML_TYPE_Q5_K:
|
| 7977 |
//case GGML_TYPE_Q6_K:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7978 |
case GGML_TYPE_IQ4_NL:
|
| 7979 |
break;
|
| 7980 |
default:
|
|
@@ -7992,6 +8112,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 7992 |
case GGML_TYPE_Q5_0:
|
| 7993 |
case GGML_TYPE_Q5_1:
|
| 7994 |
case GGML_TYPE_Q8_0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7995 |
case GGML_TYPE_IQ4_NL:
|
| 7996 |
return true;
|
| 7997 |
default:
|
|
|
|
| 1616 |
//CREATE_FA(GGML_TYPE_Q4_K, q4_k)
|
| 1617 |
//CREATE_FA(GGML_TYPE_Q5_K, q5_k)
|
| 1618 |
//CREATE_FA(GGML_TYPE_Q6_K, q6_k)
|
| 1619 |
+
//CREATE_FA(GGML_TYPE_IQ2_XXS, iq2_xxs)
|
| 1620 |
+
//CREATE_FA(GGML_TYPE_IQ2_XS, iq2_xs)
|
| 1621 |
+
//CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
|
| 1622 |
+
//CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
|
| 1623 |
+
//CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
|
| 1624 |
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
|
| 1625 |
#undef CREATE_FA
|
| 1626 |
|
|
|
|
| 1649 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1650 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1651 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
| 1652 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1653 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1654 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1655 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1656 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1657 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1658 |
|
| 1659 |
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
| 1660 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
|
|
|
| 1667 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1668 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1669 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1670 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1671 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1672 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1673 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1674 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1675 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1676 |
#undef CREATE_MM
|
| 1677 |
#undef CREATE_MM2
|
| 1678 |
} else
|
|
|
|
| 1720 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1721 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1722 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1723 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1724 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1725 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1726 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1727 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1728 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1729 |
} else {
|
| 1730 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1731 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1738 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1739 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1740 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1741 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1742 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1743 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1744 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1745 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1746 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1747 |
}
|
| 1748 |
|
| 1749 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
|
|
| 1764 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1765 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1766 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1767 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1768 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1769 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1770 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1771 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1772 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1773 |
} else {
|
| 1774 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1775 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1782 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1783 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1784 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1785 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1786 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1787 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1788 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1789 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1790 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1791 |
}
|
| 1792 |
}
|
| 1793 |
#undef CREATE_MM2
|
|
|
|
| 1831 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1832 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1833 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1834 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1835 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1836 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1837 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1838 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1839 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1840 |
|
| 1841 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
| 1842 |
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
|
|
|
| 1855 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1856 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1857 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1858 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1859 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1860 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1861 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1862 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1863 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1864 |
}
|
| 1865 |
#undef CREATE_MM2
|
| 1866 |
#undef CREATE_MM
|
|
|
|
| 1896 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1897 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1898 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1899 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1900 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1901 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1902 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1903 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1904 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1905 |
|
| 1906 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
| 1907 |
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
|
|
|
| 1920 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1921 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1922 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1923 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1924 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1925 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1926 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1927 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1928 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1929 |
}
|
| 1930 |
#undef CREATE_MM
|
| 1931 |
}
|
|
|
|
| 1956 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1957 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1958 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1959 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1960 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1961 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1962 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1963 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1964 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1965 |
|
| 1966 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
| 1967 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
|
|
|
| 1975 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1976 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1977 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1978 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1979 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1980 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1981 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1982 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1983 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1984 |
}
|
| 1985 |
|
| 1986 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
|
|
|
| 1995 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1996 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1997 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1998 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 1999 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2000 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2001 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2002 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2003 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
| 2004 |
|
| 2005 |
// dequant shaders
|
| 2006 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
|
|
|
| 2014 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2015 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 2016 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 2017 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XXS], "dequant_iq2_xxs", dequant_iq2_xxs_len, dequant_iq2_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2018 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XS], "dequant_iq2_xs", dequant_iq2_xs_len, dequant_iq2_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2019 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2020 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2021 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2022 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
| 2023 |
|
| 2024 |
// get_rows
|
| 2025 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
|
|
| 2029 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2030 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2031 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2032 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs", get_rows_iq2_xxs_len, get_rows_iq2_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2033 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs", get_rows_iq2_xs_len, get_rows_iq2_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2034 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2035 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2036 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2037 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2038 |
|
| 2039 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
| 2040 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
|
|
| 2043 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2044 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2045 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2046 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs_f32", get_rows_iq2_xxs_f32_len, get_rows_iq2_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2047 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs_f32", get_rows_iq2_xs_f32_len, get_rows_iq2_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2048 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2049 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2050 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2051 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2052 |
|
| 2053 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
| 2054 |
|
|
|
|
| 2975 |
case GGML_TYPE_Q4_K:
|
| 2976 |
case GGML_TYPE_Q5_K:
|
| 2977 |
case GGML_TYPE_Q6_K:
|
| 2978 |
+
case GGML_TYPE_IQ2_XXS:
|
| 2979 |
+
case GGML_TYPE_IQ2_XS:
|
| 2980 |
+
case GGML_TYPE_IQ2_S:
|
| 2981 |
+
case GGML_TYPE_IQ3_XXS:
|
| 2982 |
+
case GGML_TYPE_IQ3_S:
|
| 2983 |
case GGML_TYPE_IQ4_NL:
|
| 2984 |
break;
|
| 2985 |
default:
|
|
|
|
| 3028 |
case GGML_TYPE_Q4_K:
|
| 3029 |
case GGML_TYPE_Q5_K:
|
| 3030 |
case GGML_TYPE_Q6_K:
|
| 3031 |
+
case GGML_TYPE_IQ2_XXS:
|
| 3032 |
+
case GGML_TYPE_IQ2_XS:
|
| 3033 |
+
case GGML_TYPE_IQ2_S:
|
| 3034 |
+
case GGML_TYPE_IQ3_XXS:
|
| 3035 |
+
case GGML_TYPE_IQ3_S:
|
| 3036 |
case GGML_TYPE_IQ4_NL:
|
| 3037 |
break;
|
| 3038 |
default:
|
|
|
|
| 3064 |
case GGML_TYPE_Q4_K:
|
| 3065 |
case GGML_TYPE_Q5_K:
|
| 3066 |
case GGML_TYPE_Q6_K:
|
| 3067 |
+
case GGML_TYPE_IQ2_XXS:
|
| 3068 |
+
case GGML_TYPE_IQ2_XS:
|
| 3069 |
+
case GGML_TYPE_IQ2_S:
|
| 3070 |
+
case GGML_TYPE_IQ3_XXS:
|
| 3071 |
+
case GGML_TYPE_IQ3_S:
|
| 3072 |
case GGML_TYPE_IQ4_NL:
|
| 3073 |
break;
|
| 3074 |
default:
|
|
|
|
| 3112 |
case GGML_TYPE_Q4_K:
|
| 3113 |
case GGML_TYPE_Q5_K:
|
| 3114 |
case GGML_TYPE_Q6_K:
|
| 3115 |
+
case GGML_TYPE_IQ2_XXS:
|
| 3116 |
+
case GGML_TYPE_IQ2_XS:
|
| 3117 |
+
case GGML_TYPE_IQ2_S:
|
| 3118 |
+
case GGML_TYPE_IQ3_XXS:
|
| 3119 |
+
case GGML_TYPE_IQ3_S:
|
| 3120 |
case GGML_TYPE_IQ4_NL:
|
| 3121 |
break;
|
| 3122 |
default:
|
|
|
|
| 3143 |
case GGML_TYPE_Q4_K:
|
| 3144 |
case GGML_TYPE_Q5_K:
|
| 3145 |
case GGML_TYPE_Q6_K:
|
| 3146 |
+
case GGML_TYPE_IQ2_XXS:
|
| 3147 |
+
case GGML_TYPE_IQ2_XS:
|
| 3148 |
+
case GGML_TYPE_IQ2_S:
|
| 3149 |
+
case GGML_TYPE_IQ3_XXS:
|
| 3150 |
+
case GGML_TYPE_IQ3_S:
|
| 3151 |
case GGML_TYPE_IQ4_NL:
|
| 3152 |
break;
|
| 3153 |
default:
|
|
|
|
| 8017 |
case GGML_TYPE_Q4_K:
|
| 8018 |
case GGML_TYPE_Q5_K:
|
| 8019 |
case GGML_TYPE_Q6_K:
|
| 8020 |
+
case GGML_TYPE_IQ2_XXS:
|
| 8021 |
+
case GGML_TYPE_IQ2_XS:
|
| 8022 |
+
case GGML_TYPE_IQ2_S:
|
| 8023 |
+
case GGML_TYPE_IQ3_XXS:
|
| 8024 |
+
case GGML_TYPE_IQ3_S:
|
| 8025 |
case GGML_TYPE_IQ4_NL:
|
| 8026 |
break;
|
| 8027 |
default:
|
|
|
|
| 8090 |
//case GGML_TYPE_Q4_K:
|
| 8091 |
//case GGML_TYPE_Q5_K:
|
| 8092 |
//case GGML_TYPE_Q6_K:
|
| 8093 |
+
//case GGML_TYPE_IQ2_XXS:
|
| 8094 |
+
//case GGML_TYPE_IQ2_XS:
|
| 8095 |
+
//case GGML_TYPE_IQ2_S:
|
| 8096 |
+
//case GGML_TYPE_IQ3_XXS:
|
| 8097 |
+
//case GGML_TYPE_IQ3_S:
|
| 8098 |
case GGML_TYPE_IQ4_NL:
|
| 8099 |
break;
|
| 8100 |
default:
|
|
|
|
| 8112 |
case GGML_TYPE_Q5_0:
|
| 8113 |
case GGML_TYPE_Q5_1:
|
| 8114 |
case GGML_TYPE_Q8_0:
|
| 8115 |
+
case GGML_TYPE_IQ2_XXS:
|
| 8116 |
+
case GGML_TYPE_IQ2_XS:
|
| 8117 |
+
case GGML_TYPE_IQ2_S:
|
| 8118 |
+
case GGML_TYPE_IQ3_XXS:
|
| 8119 |
+
case GGML_TYPE_IQ3_S:
|
| 8120 |
case GGML_TYPE_IQ4_NL:
|
| 8121 |
return true;
|
| 8122 |
default:
|
ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
CHANGED
|
@@ -12,8 +12,8 @@ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
void main() {
|
| 15 |
-
#if defined(DATA_A_IQ4_NL)
|
| 16 |
-
|
| 17 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 18 |
return;
|
| 19 |
}
|
|
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
void main() {
|
| 15 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 16 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 18 |
return;
|
| 19 |
}
|
ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
CHANGED
|
@@ -217,8 +217,8 @@ void quantize(uint dst_idx, uint src_idx)
|
|
| 217 |
#endif
|
| 218 |
|
| 219 |
void main() {
|
| 220 |
-
#if defined(DATA_A_IQ4_NL)
|
| 221 |
-
|
| 222 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 223 |
return;
|
| 224 |
}
|
|
|
|
| 217 |
#endif
|
| 218 |
|
| 219 |
void main() {
|
| 220 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 221 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 222 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 223 |
return;
|
| 224 |
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
CHANGED
|
@@ -88,6 +88,222 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
|
| 88 |
}
|
| 89 |
#endif
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
#if defined(DATA_A_IQ4_NL)
|
| 92 |
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 93 |
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
|
@@ -105,7 +321,7 @@ vec2 get_dm(uint ib, uint a_offset) {
|
|
| 105 |
}
|
| 106 |
#endif
|
| 107 |
|
| 108 |
-
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
|
| 109 |
// Returns a vec2 for block `a_offset + ib`: x is the block's `d` field
// converted to float, y is the constant 0.
// NOTE(review): presumably the pair is (scale, offset) and these formats have
// no offset term; confirm against the callers of get_dm.
vec2 get_dm(uint ib, uint a_offset) {
|
| 110 |
return vec2(float(data_a[a_offset + ib].d), 0);
|
| 111 |
}
|
|
|
|
| 88 |
}
|
| 89 |
#endif
|
| 90 |
|
| 91 |
+
#if defined(DATA_A_IQ2_XXS)
|
| 92 |
+
// IQ2_XXS: dequantizes two values of block `a_offset + ib`.
// `iqs` selects the position inside the block: iqs / 32 picks a 32-value
// group, (iqs / 8) % 4 an 8-value sub-group within it, and iqs % 8 the
// position inside that sub-group.
// Returns the first two selected grid bytes with sign and `db` applied.
// NOTE(review): iqs % 4 is used as a byte shift and only .x/.y are read, so
// callers presumably pass an even iqs; confirm against the call sites.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 93 |
+
const uint ib32 = iqs / 32;
|
| 94 |
+
const uint ib8 = (iqs / 8) % 4;
|
| 95 |
+
// Grid index for this 8-value sub-group.
const uint qs = data_a[a_offset + ib].qs[8 * ib32 + ib8];
|
| 96 |
+
// Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
// NOTE(review): presumably data_a_packed16 is a 16-bit view of the same
// blocks as data_a; verify against the buffer declarations.
|
| 97 |
+
const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[4 * ib32 + 2],
|
| 98 |
+
data_a_packed16[a_offset + ib].qs[4 * ib32 + 3]));
|
| 99 |
+
// The top 4 bits of `signs` are the 4-bit scale.
const float db = 0.25 * (0.5 + (signs >> 28));
|
| 100 |
+
// 7 sign bits belonging to sub-group ib8.
const uint sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
|
| 101 |
+
// Add parity bit
// (bit 7 receives the low bit of bitCount(sign7))
|
| 102 |
+
const uint sign8 = sign7 | (bitCount(sign7) << 7);
|
| 103 |
+
// Move the sign bit of the first requested value down to bit 0.
const uint sign = sign8 >> (iqs % 8);
|
| 104 |
+
// (iqs % 8) / 4 picks one of the two words of the grid entry; the shift
// moves the requested byte into .x before unpacking.
const u8vec4 grid = unpack8(iq2xxs_grid[qs][(iqs % 8) / 4] >> (8 * (iqs % 4)));
|
| 105 |
+
bool sign0 = (sign & 1) != 0;
|
| 106 |
+
bool sign1 = (sign & 2) != 0;
|
| 107 |
+
// A set sign bit negates the corresponding grid value.
return db * vec2(
|
| 108 |
+
grid.x * (sign0 ? -1.0 : 1.0),
|
| 109 |
+
grid.y * (sign1 ? -1.0 : 1.0)
|
| 110 |
+
);
|
| 111 |
+
}
|
| 112 |
+
// IQ2_XXS: dequantizes four values of block `a_offset + ib`.
// Same decoding steps as the two-value variant, but all four unpacked grid
// bytes (.x/.y/.z/.w) and four sign bits are used.
// NOTE(review): the grid word is shifted by 8 * (iqs % 4) before all four
// bytes are read, so callers presumably pass iqs as a multiple of 4; confirm.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
| 113 |
+
const uint ib32 = iqs / 32;
|
| 114 |
+
const uint ib8 = (iqs / 8) % 4;
|
| 115 |
+
// Grid index for this 8-value sub-group.
const uint qs = data_a[a_offset + ib].qs[8 * ib32 + ib8];
|
| 116 |
+
// Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
// NOTE(review): presumably data_a_packed16 is a 16-bit view of the same
// blocks as data_a; verify against the buffer declarations.
|
| 117 |
+
const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[4 * ib32 + 2],
|
| 118 |
+
data_a_packed16[a_offset + ib].qs[4 * ib32 + 3]));
|
| 119 |
+
// The top 4 bits of `signs` are the 4-bit scale.
const float db = 0.25 * (0.5 + (signs >> 28));
|
| 120 |
+
// 7 sign bits belonging to sub-group ib8.
const uint sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
|
| 121 |
+
// Add parity bit
// (bit 7 receives the low bit of bitCount(sign7))
|
| 122 |
+
const uint sign8 = sign7 | (bitCount(sign7) << 7);
|
| 123 |
+
// Move the sign bit of the first requested value down to bit 0.
const uint sign = sign8 >> (iqs % 8);
|
| 124 |
+
// (iqs % 8) / 4 picks one of the two words of the grid entry.
const u8vec4 grid = unpack8(iq2xxs_grid[qs][(iqs % 8) / 4] >> (8 * (iqs % 4)));
|
| 125 |
+
bool sign0 = (sign & 1) != 0;
|
| 126 |
+
bool sign1 = (sign & 2) != 0;
|
| 127 |
+
bool sign2 = (sign & 4) != 0;
|
| 128 |
+
bool sign3 = (sign & 8) != 0;
|
| 129 |
+
// A set sign bit negates the corresponding grid value.
return db * vec4(
|
| 130 |
+
grid.x * (sign0 ? -1.0 : 1.0),
|
| 131 |
+
grid.y * (sign1 ? -1.0 : 1.0),
|
| 132 |
+
grid.z * (sign2 ? -1.0 : 1.0),
|
| 133 |
+
grid.w * (sign3 ? -1.0 : 1.0)
|
| 134 |
+
);
|
| 135 |
+
}
|
| 136 |
+
#endif
|
| 137 |
+
|
| 138 |
+
#if defined(DATA_A_IQ2_XS)
|
| 139 |
+
// IQ2_XS: dequantizes two values of block `a_offset + ib`.
// Each `qs` entry (indexed by iqs / 8) carries a 9-bit grid index in its low
// bits (qs & 511) and sign bits above them (qs >> 9).
// Returns the first two selected grid bytes with sign and `db` applied.
// NOTE(review): iqs % 4 is used as a byte shift and only .x/.y are read, so
// callers presumably pass an even iqs; confirm against the call sites.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 140 |
+
// One scales entry per 32 values; (iqs / 16) & 1 selects its low or high nibble.
const uint scale = (data_a[a_offset + ib].scales[iqs / 32] >> (4 * ((iqs / 16) & 1))) & 0xf;
|
| 141 |
+
const uint qs = data_a[a_offset + ib].qs[iqs / 8];
|
| 142 |
+
const float db = 0.25 * (0.5 + scale);
|
| 143 |
+
// Sign bits stored above the 9-bit grid index.
const uint sign7 = qs >> 9;
|
| 144 |
+
// Add parity bit
// (bit 7 receives the low bit of bitCount(sign7))
|
| 145 |
+
const uint sign8 = sign7 | (bitCount(sign7) << 7);
|
| 146 |
+
// Move the sign bit of the first requested value down to bit 0.
const uint sign = sign8 >> (iqs % 8);
|
| 147 |
+
// (iqs % 8) / 4 picks one of the two words of the grid entry; the shift
// moves the requested byte into .x before unpacking.
const u8vec4 grid = unpack8(iq2xs_grid[qs & 511][(iqs % 8) / 4] >> (8 * (iqs % 4)));
|
| 148 |
+
bool sign0 = (sign & 1) != 0;
|
| 149 |
+
bool sign1 = (sign & 2) != 0;
|
| 150 |
+
// A set sign bit negates the corresponding grid value.
return db * vec2(
|
| 151 |
+
grid.x * (sign0 ? -1.0 : 1.0),
|
| 152 |
+
grid.y * (sign1 ? -1.0 : 1.0)
|
| 153 |
+
);
|
| 154 |
+
}
|
| 155 |
+
// IQ2_XS: decode four consecutive weights of block `a_offset + ib`, starting
// at element `iqs`. The block scale `d` is not read here.
// NOTE(review): presumably called with iqs a multiple of 4 - confirm.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk  = a_offset + ib;
    const uint lane = iqs % 8;   // first element inside its group of 8

    // One scale byte per 32 elements, one nibble per 16 elements.
    const uint nibble_shift = 4 * ((iqs / 16) & 1);
    const uint scale4 = (data_a[blk].scales[iqs / 32] >> nibble_shift) & 0xf;
    const float db = 0.25 * (0.5 + scale4);

    // Low 9 bits: grid row. Bits above: seven sign bits.
    const uint packed = data_a[blk].qs[iqs / 8];
    const uint bits7  = packed >> 9;
    // Rebuild the eighth sign bit from the low bit of the popcount.
    const uint flips  = (bits7 | (bitCount(bits7) << 7)) >> lane;

    const u8vec4 g = unpack8(iq2xs_grid[packed & 511][lane / 4] >> (8 * (iqs % 4)));

    return db * vec4(
        g.x * ((flips & 1) != 0 ? -1.0 : 1.0),
        g.y * ((flips & 2) != 0 ? -1.0 : 1.0),
        g.z * ((flips & 4) != 0 ? -1.0 : 1.0),
        g.w * ((flips & 8) != 0 ? -1.0 : 1.0)
    );
}
|
| 175 |
+
#endif
|
| 176 |
+
|
| 177 |
+
#if defined(DATA_A_IQ2_S)
|
| 178 |
+
// IQ2_S: decode two consecutive weights of block `a_offset + ib`, starting at
// element `iqs`. The block scale `d` is not read here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub32 = iqs / 32;   // 32-element sub-block
    const uint grp8  = iqs / 8;    // group of 8 within the whole block
    const uint lane  = iqs % 8;    // first element inside the group

    // One scale byte per sub-block, one nibble per 16 elements.
    const uint scale4 = (data_a[blk].scales[sub32] >> (4 * ((iqs / 16) & 1))) & 0xf;
    const float db = 0.25 * (0.5 + scale4);

    // Grid index: low 8 bits from qs, two more bits from this group's 2-bit
    // field in qh, moved into bit positions 8..9.
    const uint low8    = data_a[blk].qs[grp8];
    const uint high    = data_a[blk].qh[sub32];
    const uint hi_bits = (high << (8 - 2 * (grp8 % 4))) & 0x300;

    // Explicit sign bytes are stored in qs after the first QUANT_K / 8 entries.
    const uint flips = data_a[blk].qs[QUANT_K / 8 + grp8] >> lane;

    const u8vec4 g = unpack8(iq2s_grid[low8 | hi_bits][lane / 4]);
    const uint first = iqs % 4;

    return db * vec2(
        g[first]     * ((flips & 1) != 0 ? -1.0 : 1.0),
        g[first + 1] * ((flips & 2) != 0 ? -1.0 : 1.0)
    );
}
|
| 197 |
+
// IQ2_S: decode four consecutive weights of block `a_offset + ib`, starting at
// element `iqs`. The block scale `d` is not read here.
// NOTE(review): all four grid bytes are used unshifted, so this presumably
// expects iqs to be a multiple of 4 - confirm against callers.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub32 = iqs / 32;   // 32-element sub-block
    const uint grp8  = iqs / 8;    // group of 8 within the whole block
    const uint lane  = iqs % 8;    // first element inside the group

    // One scale byte per sub-block, one nibble per 16 elements.
    const uint scale4 = (data_a[blk].scales[sub32] >> (4 * ((iqs / 16) & 1))) & 0xf;
    const float db = 0.25 * (0.5 + scale4);

    // Grid index: low 8 bits from qs, bits 8..9 from this group's 2-bit field in qh.
    const uint low8    = data_a[blk].qs[grp8];
    const uint high    = data_a[blk].qh[sub32];
    const uint hi_bits = (high << (8 - 2 * (grp8 % 4))) & 0x300;

    // Explicit sign bytes are stored in qs after the first QUANT_K / 8 entries.
    const uint flips = data_a[blk].qs[QUANT_K / 8 + grp8] >> lane;

    const u8vec4 g = unpack8(iq2s_grid[low8 | hi_bits][lane / 4]);

    return db * vec4(
        g.x * ((flips & 1) != 0 ? -1.0 : 1.0),
        g.y * ((flips & 2) != 0 ? -1.0 : 1.0),
        g.z * ((flips & 4) != 0 ? -1.0 : 1.0),
        g.w * ((flips & 8) != 0 ? -1.0 : 1.0)
    );
}
|
| 220 |
+
#endif
|
| 221 |
+
|
| 222 |
+
#if defined(DATA_A_IQ3_XXS)
|
| 223 |
+
// IQ3_XXS: decode two consecutive weights of block `a_offset + ib`, starting
// at element `iqs`. The block scale `d` is not read here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub32 = iqs / 32;                   // 32-element sub-block
    const uint meta  = QUANT_K / 4 + 4 * sub32;    // byte offset of this sub-block's sign/scale word in qs

    // One grid index byte per 4 elements.
    const uint grid_idx = data_a[blk].qs[iqs / 4];

    // Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
    const uint packed = pack32(u16vec2(data_a_packed16[blk].qs[meta / 2],
                                       data_a_packed16[blk].qs[meta / 2 + 1]));
    const float db = 0.5 * (0.5 + (packed >> 28));

    // Pick the 7-bit sign tuple of this group of 8 and rebuild the eighth
    // bit from the low bit of the popcount.
    const uint bits7 = bitfieldExtract(packed, 7 * (int(iqs / 8) % 4), 7);
    const uint flips = (bits7 | (bitCount(bits7) << 7)) >> (iqs % 8);

    const u8vec4 g = unpack8(iq3xxs_grid[grid_idx] >> (8 * (iqs % 4)));

    return db * vec2(
        g.x * ((flips & 1) != 0 ? -1.0 : 1.0),
        g.y * ((flips & 2) != 0 ? -1.0 : 1.0)
    );
}
|
| 244 |
+
// IQ3_XXS: decode four consecutive weights of block `a_offset + ib`, starting
// at element `iqs`. The block scale `d` is not read here.
// NOTE(review): the grid word is used unshifted, so this presumably expects
// iqs to be a multiple of 4 - confirm against callers.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub32 = iqs / 32;                   // 32-element sub-block
    const uint meta  = QUANT_K / 4 + 4 * sub32;    // byte offset of this sub-block's sign/scale word in qs

    // One grid index byte per 4 elements.
    const uint grid_idx = data_a[blk].qs[iqs / 4];

    // Packed word: four 7-bit sign tuples in the low 28 bits, 4-bit scale on top.
    const uint packed = pack32(u16vec2(data_a_packed16[blk].qs[meta / 2],
                                       data_a_packed16[blk].qs[meta / 2 + 1]));
    const float db = 0.5 * (0.5 + (packed >> 28));

    // Pick the 7-bit sign tuple of this group of 8 and rebuild the eighth
    // bit from the low bit of the popcount.
    const uint bits7 = bitfieldExtract(packed, 7 * (int(iqs / 8) % 4), 7);
    const uint flips = (bits7 | (bitCount(bits7) << 7)) >> (iqs % 8);

    const u8vec4 g = unpack8(iq3xxs_grid[grid_idx]);

    return db * vec4(
        g.x * ((flips & 1) != 0 ? -1.0 : 1.0),
        g.y * ((flips & 2) != 0 ? -1.0 : 1.0),
        g.z * ((flips & 4) != 0 ? -1.0 : 1.0),
        g.w * ((flips & 8) != 0 ? -1.0 : 1.0)
    );
}
|
| 268 |
+
#endif
|
| 269 |
+
|
| 270 |
+
#if defined(DATA_A_IQ3_S)
|
| 271 |
+
// IQ3_S: decode two consecutive weights of block `a_offset + ib`, starting at
// element `iqs`. The block scale `d` is not read here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk  = a_offset + ib;
    const uint grp4 = iqs / 4;    // one grid index byte per 4 elements

    // One sign byte per 8 elements; shift so bit 0 belongs to element iqs.
    const uint flips = data_a[blk].signs[iqs / 8] >> (iqs % 8);

    // One scale byte per 64 elements, one nibble per 32 elements.
    const uint scale_byte = data_a[blk].scales[iqs / 64];
    const float db = 1 + 2 * ((scale_byte >> (4 * ((iqs / 32) & 1))) & 0xf);

    // Ninth grid index bit: bit (grp4 % 8) of this sub-block's qh byte.
    const uint low8 = data_a[blk].qs[grp4];
    const uint high = data_a[blk].qh[iqs / 32];
    const uint32_t word = iq3s_grid[low8 | ((high << (8 - (grp4 % 8))) & 256)] >> (8 * (iqs % 4));

    return db * vec2(
        int(word & 0xFF)        * ((flips & 1) != 0 ? -1.0 : 1.0),
        int((word >> 8) & 0xFF) * ((flips & 2) != 0 ? -1.0 : 1.0)
    );
}
|
| 285 |
+
// IQ3_S: decode four consecutive weights of block `a_offset + ib`, starting at
// element `iqs`. The block scale `d` is not read here.
// NOTE(review): presumably called with iqs a multiple of 4 - confirm.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint grp4  = iqs / 4;    // one grid index byte per 4 elements
    const uint sub32 = iqs / 32;   // 32-element sub-block

    // One sign byte per 8 elements; shift so bit 0 belongs to element iqs.
    const uint flips = data_a[blk].signs[iqs / 8] >> (iqs % 8);

    // One scale byte per two sub-blocks, one nibble per sub-block.
    const uint scale_byte = data_a[blk].scales[sub32 / 2];
    const float db = 1 + 2 * ((scale_byte >> (4 * (sub32 & 1))) & 0xf);

    // Ninth grid index bit: bit (grp4 % 8) of this sub-block's qh byte.
    const uint low8 = data_a[blk].qs[grp4];
    const uint high = data_a[blk].qh[sub32];
    const uint32_t word = iq3s_grid[low8 | ((high << (8 - grp4 % 8)) & 256)] >> (8 * (iqs % 4));

    return db * vec4(
        int(word & 0xFF)         * ((flips & 1) != 0 ? -1.0 : 1.0),
        int((word >> 8) & 0xFF)  * ((flips & 2) != 0 ? -1.0 : 1.0),
        int((word >> 16) & 0xFF) * ((flips & 4) != 0 ? -1.0 : 1.0),
        int((word >> 24) & 0xFF) * ((flips & 8) != 0 ? -1.0 : 1.0)
    );
}
|
| 305 |
+
#endif
|
| 306 |
+
|
| 307 |
#if defined(DATA_A_IQ4_NL)
|
| 308 |
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 309 |
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
|
|
|
| 321 |
}
|
| 322 |
#endif
|
| 323 |
|
| 324 |
+
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 325 |
// Returns (d, 0) for block `a_offset + ib`: the block scale converted to
// float in .x and a constant zero in .y.
vec2 get_dm(uint ib, uint a_offset) {
    const float block_scale = float(data_a[a_offset + ib].d);
    return vec2(block_scale, 0);
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
CHANGED
|
@@ -301,6 +301,160 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2
|
|
| 301 |
return ret;
|
| 302 |
}
|
| 303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
#if defined(DATA_A_IQ4_NL)
|
| 305 |
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
|
| 306 |
block_iq4_nl block;
|
|
@@ -340,6 +494,16 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
|
|
| 340 |
#define dequantFuncA dequantFuncQ5_K
|
| 341 |
#elif defined(DATA_A_Q6_K)
|
| 342 |
#define dequantFuncA dequantFuncQ6_K
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
#elif defined(DATA_A_IQ4_NL)
|
| 344 |
#define dequantFuncA dequantFuncIQ4_NL
|
| 345 |
#endif
|
|
|
|
| 301 |
return ret;
|
| 302 |
}
|
| 303 |
|
| 304 |
+
#if defined(DATA_A_IQ2_XXS)
|
| 305 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XXS {
|
| 306 |
+
block_iq2_xxs block;
|
| 307 |
+
};
|
| 308 |
+
|
| 309 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XXS_packed16 {
|
| 310 |
+
block_iq2_xxs_packed16 block;
|
| 311 |
+
};
|
| 312 |
+
|
| 313 |
+
// Decode callback for IQ2_XXS: returns the single element selected by
// coordInBlock[1] from the block referenced by `bl`, already multiplied by
// the block scale d. blockCoords is not used.
float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // Same block viewed as 16-bit words, used to fetch the sign/scale word.
    decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl);
    const float16_t d = bl.block.d;
    const uint idx = coordInBlock[1];

    const uint ib32 = (idx & 0xE0) >> 5; // 0..7
    const uint ib8 = (idx & 0x18) >> 3;  // 0..3
    const uint iqs = 8 * ib32 + ib8;

    const uint8_t qs = bl.block.qs[iqs];
    // Packed word: four 7-bit sign tuples in the low 28 bits, 4-bit scale on top.
    const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));

    // Reuse the already loaded d instead of reading bl.block.d a second time.
    const float16_t dscale = d * 0.25hf * (0.5hf + float16_t(signscale >> 28));
    uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
    // Rebuild the eighth sign bit from the low bit of the popcount.
    sign |= bitCount(sign) << 7;

    const uint8_t g = unpack8(iq2xxs_grid[qs][(idx & 4) >> 2])[idx & 3];

    float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);

    return ret;
}
|
| 336 |
+
#endif
|
| 337 |
+
|
| 338 |
+
#if defined(DATA_A_IQ2_XS)
|
| 339 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XS {
|
| 340 |
+
block_iq2_xs block;
|
| 341 |
+
};
|
| 342 |
+
|
| 343 |
+
// Decode callback for IQ2_XS: returns the single element selected by
// coordInBlock[1] from the block referenced by `bl`, already multiplied by
// the block scale d. blockCoords is not used.
float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const float16_t d = bl.block.d;
    const uint idx = coordInBlock[1];

    const uint is = (idx & 0xE0) >> 5;     // 0..7  (scale byte, one per 32 elements)
    const uint sshift = (idx & 0x10) >> 2; // 0,4   (nibble within the scale byte)
    const uint iqs = (idx & 0xF8) >> 3;    // 0..31 (qs entry, one per 8 elements)

    const uint16_t qs = bl.block.qs[iqs];
    // Reuse the already loaded d instead of reading bl.block.d a second time.
    const float16_t dscale = d * 0.25hf * (0.5hf + float16_t((bl.block.scales[is] >> sshift) & 0xF));

    // Bits above the 9-bit grid index hold seven sign bits; the eighth is
    // rebuilt from the low bit of their popcount.
    uint sign = uint(qs >> 9);
    sign |= bitCount(sign) << 7;
    const uint8_t g = unpack8(iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2])[idx & 3];

    float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
    return ret;
}
|
| 362 |
+
#endif
|
| 363 |
+
|
| 364 |
+
#if defined(DATA_A_IQ2_S)
|
| 365 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_S {
|
| 366 |
+
block_iq2_s block;
|
| 367 |
+
};
|
| 368 |
+
|
| 369 |
+
// Decode callback for IQ2_S: decodes the pair of elements containing
// coordInBlock[1] and returns the requested one, already multiplied by the
// block scale d. blockCoords is not used.
float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const uint which = elem & 1;   // which member of the decoded pair to return
    const uint pair = elem / 2;    // index of the 2-element pair

    const uint grp8 = (pair % 128) / 4;  // 0..31
    const uint sub32 = grp8 / 4;         // 0..7

    // One scale byte per sub-block; the shift selects the low or high nibble.
    const uint scale4 = (bl.block.scales[sub32] >> (2 * (grp8 & 2))) & 0xf;
    const uint low8 = bl.block.qs[grp8];
    const uint high = bl.block.qh[sub32];
    const uint hi_shift = 2 * (grp8 % 4);
    // Sign bytes are stored in qs after the first QUANT_K / 8 entries; shift
    // so bits 0 and 1 belong to this pair.
    const uint flips = bl.block.qs[QUANT_K / 8 + grp8] >> (2 * (pair % 4));

    const float d = float(bl.block.d);
    const float db = d * 0.25 * (0.5 + scale4);
    // Per component: (2 & x) is 2 when the sign bit is set, giving 1 - 2 = -1, else +1.
    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(flips << 1), int8_t(flips))));
    // 10-bit grid row, then the 16-bit half holding this pair.
    const uint16_t grid = unpack16(iq2s_grid[low8 | ((high << (8 - hi_shift)) & 0x300)][(pair & 2) >> 1])[pair & 1];
    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid));
    return float16_t(v[which]);
}
|
| 391 |
+
#endif
|
| 392 |
+
|
| 393 |
+
#if defined(DATA_A_IQ3_XXS)
|
| 394 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_XXS {
|
| 395 |
+
block_iq3_xxs block;
|
| 396 |
+
};
|
| 397 |
+
|
| 398 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_XXS_packed16 {
|
| 399 |
+
block_iq3_xxs_packed16 block;
|
| 400 |
+
};
|
| 401 |
+
|
| 402 |
+
// Decode callback for IQ3_XXS: decodes the pair of elements containing
// coordInBlock[1] and returns the requested one, already multiplied by the
// block scale d. blockCoords is not used.
float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const uint which = elem & 1;   // which member of the decoded pair to return
    const uint pair = elem / 2;    // index of the 2-element pair

    const uint grid_byte = (pair % 128) / 2;                  // 0..63
    const uint meta = QUANT_K / 4 + 4 * (grid_byte / 8);      // 8 values

    const float d = float(bl.block.d);
    const uint grid_idx = bl.block.qs[grid_byte];
    // Packed word: four 7-bit sign tuples in the low 28 bits, 4-bit scale on top.
    const uint packed = pack32(u8vec4(
        bl.block.qs[meta + 0],
        bl.block.qs[meta + 1],
        bl.block.qs[meta + 2],
        bl.block.qs[meta + 3]
    ));
    const float db = d * 0.5 * (0.5 + (packed >> 28));
    const uint32_t bits7 = bitfieldExtract(packed, 7 * (int(grid_byte / 2) % 4), 7);
    // Rebuild the eighth sign bit from the popcount, then move this pair's
    // two sign bits down to bits 0 and 1.
    const uint flips = (bits7 | (bitCount(bits7) << 7)) >> (2 * (pair % 4));
    // Per component: (2 & x) is 2 when the sign bit is set, giving 1 - 2 = -1, else +1.
    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(flips << 1), int8_t(flips))));
    // Low or high 16 bits of the 4-value grid word.
    const uint grid = iq3xxs_grid[grid_idx] >> (16 * (pair & 1));
    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
    return float16_t(v[which]);
}
|
| 427 |
+
#endif
|
| 428 |
+
|
| 429 |
+
#if defined(DATA_A_IQ3_S)
|
| 430 |
+
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_S {
|
| 431 |
+
block_iq3_s block;
|
| 432 |
+
};
|
| 433 |
+
|
| 434 |
+
// Decode callback for IQ3_S: decodes the pair of elements containing
// coordInBlock[1] and returns the requested one, already multiplied by the
// block scale d. blockCoords is not used.
float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const uint which = elem & 1;   // which member of the decoded pair to return
    const uint pair = elem / 2;    // index of the 2-element pair

    const uint grid_byte = (pair % 128) / 2;  // 0..63
    const uint sub32 = grid_byte / 8;         // qh byte / 32-element sub-block

    const float d = float(bl.block.d);
    const uint low8 = bl.block.qs[grid_byte];
    const uint high = bl.block.qh[sub32];
    // One sign byte per 8 elements; shift so bits 0 and 1 belong to this pair.
    const int8_t flips = int8_t(bl.block.signs[grid_byte / 2] >> (2 * (pair % 4)));
    // One scale byte per two sub-blocks, one nibble per sub-block.
    const uint scale_byte = bl.block.scales[grid_byte / 16];
    // Per component: (2 & x) is 2 when the sign bit is set, giving 1 - 2 = -1, else +1.
    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(flips << 1, flips)));
    const float db = d * (1 + 2 * ((scale_byte >> (4 * (sub32 & 1))) & 0xf));
    // 9-bit grid row, then the low or high 16 bits of the 4-value grid word.
    const uint32_t grid = iq3s_grid[low8 | ((high << (8 - (grid_byte % 8))) & 256)] >> (16 * (pair % 2));
    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);

    return float16_t(v[which]);
}
|
| 455 |
+
#endif
|
| 456 |
+
|
| 457 |
+
|
| 458 |
#if defined(DATA_A_IQ4_NL)
|
| 459 |
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
|
| 460 |
block_iq4_nl block;
|
|
|
|
| 494 |
#define dequantFuncA dequantFuncQ5_K
|
| 495 |
#elif defined(DATA_A_Q6_K)
|
| 496 |
#define dequantFuncA dequantFuncQ6_K
|
| 497 |
+
#elif defined(DATA_A_IQ2_XXS)
|
| 498 |
+
#define dequantFuncA dequantFuncIQ2_XXS
|
| 499 |
+
#elif defined(DATA_A_IQ2_XS)
|
| 500 |
+
#define dequantFuncA dequantFuncIQ2_XS
|
| 501 |
+
#elif defined(DATA_A_IQ2_S)
|
| 502 |
+
#define dequantFuncA dequantFuncIQ2_S
|
| 503 |
+
#elif defined(DATA_A_IQ3_XXS)
|
| 504 |
+
#define dequantFuncA dequantFuncIQ3_XXS
|
| 505 |
+
#elif defined(DATA_A_IQ3_S)
|
| 506 |
+
#define dequantFuncA dequantFuncIQ3_S
|
| 507 |
#elif defined(DATA_A_IQ4_NL)
|
| 508 |
#define dequantFuncA dequantFuncIQ4_NL
|
| 509 |
#endif
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq2_s data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
// Dequantizes IQ2_S blocks from data_a into data_b.
// Eight invocations cooperate on one 256-value block; each invocation writes
// the 32 values of one sub-block.
void main() {
    // Each thread handles 1 subblock (32 values with 2 scales)
    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Called before the early-out below so every invocation executes it.
    // NOTE(review): presumably it cooperatively fills shared lookup tables - confirm.
    init_iq_shmem(gl_WorkGroupSize);

    if (ib >= p.nel / 256) {
        return;
    }

    const uint ib32 = gl_LocalInvocationID.x % 8;
    const uint b_idx = 256 * ib + 32 * ib32;

    const float d = float(data_a[ib].d);
    // Low nibble scales the first 16 values, high nibble the second 16.
    const vec2 scale = vec2(data_a[ib].scales[ib32] & 0xf, data_a[ib].scales[ib32] >> 4);
    const vec2 db = d * (0.5 + scale) * 0.25;

    uint qh = data_a[ib].qh[ib32];
    [[unroll]] for (uint l = 0; l < 4; ++l) {
        uint qs = data_a[ib].qs[4 * ib32 + l];
        // Sign bytes are stored in qs after the first QUANT_K / 8 entries.
        const uint8_t sign = data_a[ib].qs[QUANT_K / 8 + 4 * ib32 + l];
        // Extend to a 10-bit grid index with this group's 2-bit field of qh.
        qs |= (qh << (8 - 2 * l)) & 0x300;
        // Use the full 10-bit index. Masking with 511 would drop bit 9 that
        // was just merged in and select the wrong grid row.
        const uvec2 grid = iq2s_grid[qs];
        const u8vec4 grid0 = unpack8(grid.x);
        const u8vec4 grid1 = unpack8(grid.y);
        data_b[b_idx + 8 * l + 0] = D_TYPE(db[l/2] * grid0.x * ((sign & 1) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 1] = D_TYPE(db[l/2] * grid0.y * ((sign & 2) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 2] = D_TYPE(db[l/2] * grid0.z * ((sign & 4) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 3] = D_TYPE(db[l/2] * grid0.w * ((sign & 8) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 4] = D_TYPE(db[l/2] * grid1.x * ((sign & 16) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 5] = D_TYPE(db[l/2] * grid1.y * ((sign & 32) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 6] = D_TYPE(db[l/2] * grid1.z * ((sign & 64) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 7] = D_TYPE(db[l/2] * grid1.w * ((sign & 128) != 0 ? -1.0 : 1.0));
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq2_xs data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
// Dequantizes IQ2_XS blocks from data_a into data_b.
// Eight invocations cooperate on one 256-value block; each invocation writes
// the 32 values of one sub-block (two 4-bit scales, 16 values each).
void main() {
    const uint block = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Runs before the early-out below so every invocation executes it.
    init_iq_shmem(gl_WorkGroupSize);

    if (block >= p.nel / 256) {
        return;
    }

    const uint sub32 = gl_LocalInvocationID.x % 8;
    const uint out_base = 256 * block + 32 * sub32;

    const float d = float(data_a[block].d);
    // Low nibble scales the first 16 values, high nibble the second 16.
    const vec2 scale = vec2(data_a[block].scales[sub32] & 0xf, data_a[block].scales[sub32] >> 4);
    const vec2 db = d * (0.5 + scale) * 0.25;

    [[unroll]] for (uint l = 0; l < 4; ++l) {
        // Low 9 bits: grid row. Bits above: seven sign bits.
        uint16_t packed = data_a[block].qs[4 * sub32 + l];
        const uint bits7 = packed >> 9;
        const uint flips = bits7 | (bitCount(bits7) << 7); // parity bit
        const uvec2 grid = iq2xs_grid[packed & 511];
        const u8vec4 lo = unpack8(grid.x);
        const u8vec4 hi = unpack8(grid.y);
        const uint out0 = out_base + 8 * l;
        [[unroll]] for (uint k = 0; k < 4; ++k) {
            data_b[out0 + k]     = D_TYPE(db[l/2] * lo[k] * ((flips & (1u << k)) != 0 ? -1.0 : 1.0));
            data_b[out0 + 4 + k] = D_TYPE(db[l/2] * hi[k] * ((flips & (16u << k)) != 0 ? -1.0 : 1.0));
        }
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq2_xxs data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
// Dequantizes IQ2_XXS blocks from data_a into data_b.
// Eight invocations cooperate on one 256-value block; each invocation writes
// one 32-value scale block, described by 4 grid indices, 4x7 sign bits and
// 4 scale bits.
void main() {
    const uint block = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Runs before the early-out below so every invocation executes it.
    init_iq_shmem(gl_WorkGroupSize);

    if (block >= p.nel / 256) {
        return;
    }

    const uint sub32 = gl_LocalInvocationID.x % 8;
    const uint out_base = 256 * block + 32 * sub32;
    const uint group = 8 * sub32;   // first qs byte of this sub-block

    const float d = float(data_a[block].d);
    // Bytes 4..7 of the group hold the packed signs (low 28 bits) and scale (top 4 bits).
    uint signscale = pack32(u8vec4(
        data_a[block].qs[group + 4],
        data_a[block].qs[group + 5],
        data_a[block].qs[group + 6],
        data_a[block].qs[group + 7]
    ));
    const float db = d * (0.5 + (signscale >> 28)) * 0.25;

    [[unroll]] for (uint l = 0; l < 4; ++l) {
        const uint bits7 = bitfieldExtract(signscale, 7 * int(l), 7);
        const uint flips = bits7 | (bitCount(bits7) << 7); // parity bit
        const uvec2 grid = iq2xxs_grid[data_a[block].qs[group + l]];
        const u8vec4 lo = unpack8(grid.x);
        const u8vec4 hi = unpack8(grid.y);
        const uint out0 = out_base + 8 * l;
        [[unroll]] for (uint k = 0; k < 4; ++k) {
            data_b[out0 + k]     = D_TYPE(db * lo[k] * ((flips & (1u << k)) != 0 ? -1.0 : 1.0));
            data_b[out0 + 4 + k] = D_TYPE(db * hi[k] * ((flips & (16u << k)) != 0 ? -1.0 : 1.0));
        }
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq3_s data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
// Dequantizes IQ3_S blocks from data_a into data_b.
// Eight invocations cooperate on one 256-value block; each invocation writes
// the 32 values governed by one 4-bit scale.
void main() {
    // Each thread handles 1 scale nibble.
    // Each block contains 4 scale bytes (8 scales) for 256 output values.
    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Called before the early-out below so every invocation executes it.
    // NOTE(review): presumably it cooperatively fills shared lookup tables - confirm.
    init_iq_shmem(gl_WorkGroupSize);

    if (ib >= p.nel / 256) {
        return;
    }

    const uint is = gl_LocalInvocationID.x % 8;
    const uint b_idx = 256 * ib + 32 * is;

    const float d = float(data_a[ib].d);
    // Two scales share one byte: byte is / 2, nibble is % 2. Indexing the
    // byte array with `is` directly would read past the 4 scale bytes for is >= 4.
    const float db = d * (1 + 2 * ((data_a[ib].scales[is / 2] >> (4 * (is % 2))) & 0xf));

    // We must produce 32 values using 4 sign bytes, 1 qh byte, 8 qs bytes.
    uint qh = data_a[ib].qh[is];
    [[unroll]] for (uint l = 0; l < 8; ++l) {
        uint qs = data_a[ib].qs[8 * is + l];
        // Bit l of qh supplies the ninth bit of the grid index.
        uint gidx = qs | ((qh << (8 - l)) & 256);
        // One sign bit per value: this thread owns sign bytes 4 * is .. 4 * is + 3,
        // and each iteration consumes one nibble (4 values).
        uint8_t signs = data_a[ib].signs[4 * is + l / 2] >> (4 * (l & 1));
        u8vec4 grid = unpack8(iq3s_grid[gidx]);
        data_b[b_idx + 4 * l + 0] = D_TYPE(db * grid.x * ((signs & 1) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 4 * l + 1] = D_TYPE(db * grid.y * ((signs & 2) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 4 * l + 2] = D_TYPE(db * grid.z * ((signs & 4) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 4 * l + 3] = D_TYPE(db * grid.w * ((signs & 8) != 0 ? -1.0 : 1.0));
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq3_xxs data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
// Dequantizes IQ3_XXS blocks from data_a into data_b.
// Eight invocations cooperate on one 256-value superblock; each invocation
// writes one 32-value scale block.
void main() {
    const uint block = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Runs before the early-out below so every invocation executes it.
    init_iq_shmem(gl_WorkGroupSize);

    if (block >= p.nel / 256) {
        return;
    }

    const uint sub32 = gl_LocalInvocationID.x % 8;
    const uint out_base = 256 * block + 32 * sub32;
    // The sign/scale words are stored in qs after the first QUANT_K / 4
    // grid index bytes, 4 bytes per sub-block.
    const uint meta = QUANT_K / 4 + 4 * sub32;

    const float d = float(data_a[block].d);
    uint signscale = pack32(u8vec4(
        data_a[block].qs[meta + 0],
        data_a[block].qs[meta + 1],
        data_a[block].qs[meta + 2],
        data_a[block].qs[meta + 3]
    ));
    const float db = d * (0.5 + (signscale >> 28)) * 0.5;

    [[unroll]] for (uint l = 0; l < 4; ++l) {
        const uint bits7 = bitfieldExtract(signscale, 7 * int(l), 7);
        // Restore parity bit.
        const uint flips = bits7 | (bitCount(bits7) << 7);
        // Two grid words (4 values each) per group of 8 outputs.
        const u8vec4 lo = unpack8(iq3xxs_grid[data_a[block].qs[8 * sub32 + 2 * l]]);
        const u8vec4 hi = unpack8(iq3xxs_grid[data_a[block].qs[8 * sub32 + 2 * l + 1]]);
        const uint out0 = out_base + 8 * l;
        [[unroll]] for (uint k = 0; k < 4; ++k) {
            data_b[out0 + k]     = D_TYPE(db * lo[k] * ((flips & (1u << k)) != 0 ? -1.0 : 1.0));
            data_b[out0 + 4 + k] = D_TYPE(db * hi[k] * ((flips & (16u << k)) != 0 ? -1.0 : 1.0));
        }
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
CHANGED
|
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
|
| 10 |
void main() {
|
| 11 |
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
| 12 |
|
| 13 |
-
|
| 14 |
|
| 15 |
const uint tid = gl_LocalInvocationID.x % 64;
|
| 16 |
const uint il = tid/32;
|
|
|
|
| 10 |
void main() {
|
| 11 |
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
| 12 |
|
| 13 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 14 |
|
| 15 |
const uint tid = gl_LocalInvocationID.x % 64;
|
| 16 |
const uint il = tid/32;
|
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
CHANGED
|
@@ -104,8 +104,8 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
|
| 104 |
#endif
|
| 105 |
|
| 106 |
void main() {
|
| 107 |
-
#if defined(DATA_A_IQ4_NL)
|
| 108 |
-
|
| 109 |
#endif
|
| 110 |
|
| 111 |
const uint32_t N = p.N;
|
|
|
|
| 104 |
#endif
|
| 105 |
|
| 106 |
void main() {
|
| 107 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 108 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 109 |
#endif
|
| 110 |
|
| 111 |
const uint32_t N = p.N;
|
ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
CHANGED
|
@@ -12,8 +12,8 @@ void main() {
|
|
| 12 |
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
| 13 |
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
| 14 |
|
| 15 |
-
#if defined(DATA_A_IQ4_NL)
|
| 16 |
-
|
| 17 |
#endif
|
| 18 |
|
| 19 |
if (i00 >= p.ne00) {
|
|
|
|
| 12 |
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
| 13 |
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
| 14 |
|
| 15 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 16 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
#endif
|
| 18 |
|
| 19 |
if (i00 >= p.ne00) {
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
CHANGED
|
@@ -133,8 +133,8 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|
| 133 |
void main() {
|
| 134 |
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
| 135 |
|
| 136 |
-
#if defined(DATA_A_IQ4_NL)
|
| 137 |
-
|
| 138 |
#endif
|
| 139 |
|
| 140 |
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
|
|
|
| 133 |
void main() {
|
| 134 |
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
| 135 |
|
| 136 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 137 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 138 |
#endif
|
| 139 |
|
| 140 |
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
CHANGED
|
@@ -95,8 +95,8 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
|
| 95 |
#endif
|
| 96 |
|
| 97 |
void main() {
|
| 98 |
-
#if defined(DATA_A_IQ4_NL)
|
| 99 |
-
|
| 100 |
#endif
|
| 101 |
|
| 102 |
#ifdef MUL_MAT_ID
|
|
@@ -343,10 +343,8 @@ void main() {
|
|
| 343 |
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
| 344 |
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
| 345 |
|
| 346 |
-
const int8_t us = int8_t(
|
| 347 |
-
|
| 348 |
-
is < 12 ? (data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
|
| 349 |
-
(data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
|
| 350 |
const float dl = float(data_a[ib].d) * float(us - 32);
|
| 351 |
|
| 352 |
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
|
|
@@ -439,6 +437,118 @@ void main() {
|
|
| 439 |
|
| 440 |
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
| 441 |
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
#elif defined(DATA_A_IQ4_NL)
|
| 443 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 444 |
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
| 95 |
#endif
|
| 96 |
|
| 97 |
void main() {
|
| 98 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 99 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 100 |
#endif
|
| 101 |
|
| 102 |
#ifdef MUL_MAT_ID
|
|
|
|
| 343 |
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
| 344 |
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
| 345 |
|
| 346 |
+
const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
|
| 347 |
+
| (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
|
|
|
|
|
|
|
| 348 |
const float dl = float(data_a[ib].d) * float(us - 32);
|
| 349 |
|
| 350 |
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
|
|
|
|
| 437 |
|
| 438 |
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
| 439 |
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
| 440 |
+
#elif defined(DATA_A_IQ2_XXS)
|
| 441 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 442 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 443 |
+
|
| 444 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 445 |
+
const uint ib32 = (idx % 128) / 16; // 0..7
|
| 446 |
+
const uint ib8 = (idx / 4) % 4;
|
| 447 |
+
|
| 448 |
+
const float d = float(data_a[ib].d);
|
| 449 |
+
const uint qs = data_a[ib].qs[8 * ib32 + ib8];
|
| 450 |
+
const uint signs = pack32(u8vec4(
|
| 451 |
+
data_a[ib].qs[8*ib32 + 4],
|
| 452 |
+
data_a[ib].qs[8*ib32 + 5],
|
| 453 |
+
data_a[ib].qs[8*ib32 + 6],
|
| 454 |
+
data_a[ib].qs[8*ib32 + 7]
|
| 455 |
+
));
|
| 456 |
+
const float db = d * 0.25 * (0.5 + (signs >> 28));
|
| 457 |
+
const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
|
| 458 |
+
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
|
| 459 |
+
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
|
| 460 |
+
const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1));
|
| 461 |
+
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 462 |
+
|
| 463 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 464 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 465 |
+
#elif defined(DATA_A_IQ2_XS)
|
| 466 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 467 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 468 |
+
|
| 469 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 470 |
+
const uint ib32 = (idx % 128) / 16; // 0..7
|
| 471 |
+
const uint ib8 = (idx / 4) % 4; // 0..3
|
| 472 |
+
|
| 473 |
+
const float d = float(data_a[ib].d);
|
| 474 |
+
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
| 475 |
+
const float db = d * 0.25 * (0.5 + scale);
|
| 476 |
+
const uint qs = data_a[ib].qs[4 * ib32 + ib8];
|
| 477 |
+
const uint sign7 = qs >> 9;
|
| 478 |
+
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
|
| 479 |
+
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
|
| 480 |
+
const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1));
|
| 481 |
+
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 482 |
+
|
| 483 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 484 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 485 |
+
#elif defined(DATA_A_IQ2_S)
|
| 486 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 487 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 488 |
+
|
| 489 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 490 |
+
const uint ib8 = (idx % 128) / 4; // 0..31
|
| 491 |
+
const uint ib32 = ib8 / 4; // 0..7
|
| 492 |
+
|
| 493 |
+
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
| 494 |
+
const uint qs = data_a[ib].qs[ib8];
|
| 495 |
+
const uint qh = data_a[ib].qh[ib32];
|
| 496 |
+
const uint qhshift = 2 * (ib8 % 4);
|
| 497 |
+
const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4));
|
| 498 |
+
|
| 499 |
+
const float d = float(data_a[ib].d);
|
| 500 |
+
const float db = d * 0.25 * (0.5 + scale);
|
| 501 |
+
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
|
| 502 |
+
const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1];
|
| 503 |
+
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid));
|
| 504 |
+
|
| 505 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 506 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 507 |
+
#elif defined(DATA_A_IQ3_XXS)
|
| 508 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 509 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 510 |
+
|
| 511 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 512 |
+
const uint iqs = (idx % 128) / 2; // 0..63
|
| 513 |
+
const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
|
| 514 |
+
|
| 515 |
+
const float d = float(data_a[ib].d);
|
| 516 |
+
const uint qs = data_a[ib].qs[iqs];
|
| 517 |
+
const uint signs = pack32(u8vec4(
|
| 518 |
+
data_a[ib].qs[is+0],
|
| 519 |
+
data_a[ib].qs[is+1],
|
| 520 |
+
data_a[ib].qs[is+2],
|
| 521 |
+
data_a[ib].qs[is+3]
|
| 522 |
+
));
|
| 523 |
+
const float db = d * 0.5 * (0.5 + (signs >> 28));
|
| 524 |
+
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
|
| 525 |
+
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
|
| 526 |
+
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
|
| 527 |
+
const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1));
|
| 528 |
+
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 529 |
+
|
| 530 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 531 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 532 |
+
#elif defined(DATA_A_IQ3_S)
|
| 533 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 534 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 535 |
+
|
| 536 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 537 |
+
const uint iqs = (idx % 128) / 2; // 0..63
|
| 538 |
+
const uint iqh = iqs / 8;
|
| 539 |
+
|
| 540 |
+
const float d = float(data_a[ib].d);
|
| 541 |
+
const uint qs = data_a[ib].qs[iqs];
|
| 542 |
+
const uint qh = data_a[ib].qh[iqh];
|
| 543 |
+
const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4)));
|
| 544 |
+
const uint scale = data_a[ib].scales[iqs / 16];
|
| 545 |
+
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
|
| 546 |
+
const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
|
| 547 |
+
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
|
| 548 |
+
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 549 |
+
|
| 550 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 551 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 552 |
#elif defined(DATA_A_IQ4_NL)
|
| 553 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 554 |
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
CHANGED
|
@@ -106,8 +106,8 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
|
|
| 106 |
#endif
|
| 107 |
|
| 108 |
void main() {
|
| 109 |
-
#if defined(DATA_A_IQ4_NL)
|
| 110 |
-
|
| 111 |
#endif
|
| 112 |
|
| 113 |
#ifdef MUL_MAT_ID
|
|
|
|
| 106 |
#endif
|
| 107 |
|
| 108 |
void main() {
|
| 109 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 110 |
+
init_iq_shmem(gl_WorkGroupSize);
|
| 111 |
#endif
|
| 112 |
|
| 113 |
#ifdef MUL_MAT_ID
|
ggml/src/ggml-vulkan/vulkan-shaders/types.comp
CHANGED
|
@@ -294,6 +294,738 @@ struct block_q6_K_packed16
|
|
| 294 |
|
| 295 |
// IQuants
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
#define QUANT_K_IQ4_NL 32
|
| 298 |
#define QUANT_R_IQ4_NL 2
|
| 299 |
|
|
@@ -318,11 +1050,11 @@ const int8_t kvalues_iq4nl_const[16] = {
|
|
| 318 |
|
| 319 |
shared FLOAT_TYPE kvalues_iq4nl[16];
|
| 320 |
|
| 321 |
-
void
|
| 322 |
{
|
| 323 |
// copy the table into shared memory and sync
|
| 324 |
-
|
| 325 |
-
kvalues_iq4nl[
|
| 326 |
}
|
| 327 |
barrier();
|
| 328 |
}
|
|
|
|
| 294 |
|
| 295 |
// IQuants
|
| 296 |
|
| 297 |
+
#define QUANT_K_IQ2_XXS 256
|
| 298 |
+
#define QUANT_R_IQ2_XXS 1
|
| 299 |
+
|
| 300 |
+
struct block_iq2_xxs // one IQ2_XXS quantization block (QUANT_K_IQ2_XXS values per block)
|
| 301 |
+
{
|
| 302 |
+
float16_t d; // half-precision factor; the IQ2_XXS load path in mul_mm.comp multiplies every value by float(d)
|
| 303 |
+
uint8_t qs[QUANT_K_IQ2_XXS/4]; // byte-addressed payload; mul_mm.comp reads it in groups of 8 bytes: 4 used as iq2xxs_grid indices, 4 packed into a 32-bit sign/scale word
|
| 304 |
+
};
|
| 305 |
+
|
| 306 |
+
struct block_iq2_xxs_packed16 // same field order as block_iq2_xxs, with qs declared as 16-bit words
|
| 307 |
+
{
|
| 308 |
+
float16_t d; // half-precision factor, as in block_iq2_xxs
|
| 309 |
+
uint16_t qs[QUANT_K_IQ2_XXS/8]; // QUANT_K_IQ2_XXS/8 uint16 entries, i.e. the same byte count as uint8_t qs[QUANT_K_IQ2_XXS/4]
|
| 310 |
+
};
|
| 311 |
+
|
| 312 |
+
#if defined(DATA_A_IQ2_XXS)
|
| 313 |
+
|
| 314 |
+
const uvec2[256] iq2xxs_grid_const = {
|
| 315 |
+
uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
|
| 316 |
+
uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x082b0808, 0x08080808),
|
| 317 |
+
uvec2(0x082b082b, 0x08080808), uvec2(0x082b2b08, 0x08080808), uvec2(0x082b2b2b, 0x08080808), uvec2(0x19080819, 0x08080808),
|
| 318 |
+
uvec2(0x19081908, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808),
|
| 319 |
+
uvec2(0x192b1908, 0x08080808), uvec2(0x2b080808, 0x08080808), uvec2(0x2b08082b, 0x08080808), uvec2(0x2b082b2b, 0x08080808),
|
| 320 |
+
uvec2(0x2b2b082b, 0x08080808), uvec2(0x08080819, 0x08080819), uvec2(0x08081908, 0x08080819), uvec2(0x08190808, 0x08080819),
|
| 321 |
+
uvec2(0x08191919, 0x08080819), uvec2(0x19080808, 0x08080819), uvec2(0x2b081908, 0x08080819), uvec2(0x2b192b08, 0x08080819),
|
| 322 |
+
uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b), uvec2(0x082b082b, 0x0808082b), uvec2(0x2b08082b, 0x0808082b),
|
| 323 |
+
uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x082b0819, 0x08081908),
|
| 324 |
+
uvec2(0x082b1908, 0x08081908), uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19082b08, 0x08081908),
|
| 325 |
+
uvec2(0x192b0808, 0x08081908), uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b190808, 0x08081908),
|
| 326 |
+
uvec2(0x2b2b1908, 0x08081908), uvec2(0x08080808, 0x08081919), uvec2(0x0808082b, 0x08081919), uvec2(0x08082b08, 0x08081919),
|
| 327 |
+
uvec2(0x082b0808, 0x08081919), uvec2(0x1908192b, 0x08081919), uvec2(0x192b2b19, 0x08081919), uvec2(0x2b080808, 0x08081919),
|
| 328 |
+
uvec2(0x2b190819, 0x08081919), uvec2(0x08082b19, 0x0808192b), uvec2(0x08190808, 0x0808192b), uvec2(0x19080808, 0x0808192b),
|
| 329 |
+
uvec2(0x2b081908, 0x0808192b), uvec2(0x2b2b1908, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x08081919, 0x08082b08),
|
| 330 |
+
uvec2(0x08082b08, 0x08082b08), uvec2(0x08191908, 0x08082b08), uvec2(0x082b2b08, 0x08082b08), uvec2(0x19080819, 0x08082b08),
|
| 331 |
+
uvec2(0x19081908, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x1919082b, 0x08082b08), uvec2(0x2b082b08, 0x08082b08),
|
| 332 |
+
uvec2(0x08081908, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x0808082b, 0x08082b2b), uvec2(0x08191908, 0x08082b2b),
|
| 333 |
+
uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x082b0819, 0x08190808),
|
| 334 |
+
uvec2(0x19080808, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x2b081908, 0x08190808), uvec2(0x2b190808, 0x08190808),
|
| 335 |
+
uvec2(0x2b191919, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x08082b08, 0x08190819), uvec2(0x082b0808, 0x08190819),
|
| 336 |
+
uvec2(0x19190808, 0x08190819), uvec2(0x19192b2b, 0x08190819), uvec2(0x2b080808, 0x08190819), uvec2(0x082b1908, 0x0819082b),
|
| 337 |
+
uvec2(0x19081919, 0x0819082b), uvec2(0x08080808, 0x08191908), uvec2(0x08082b08, 0x08191908), uvec2(0x082b0808, 0x08191908),
|
| 338 |
+
uvec2(0x082b1919, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x08192b08, 0x08191919),
|
| 339 |
+
uvec2(0x192b082b, 0x08191919), uvec2(0x08080808, 0x0819192b), uvec2(0x0819192b, 0x0819192b), uvec2(0x08080819, 0x08192b08),
|
| 340 |
+
uvec2(0x08081908, 0x08192b08), uvec2(0x08190808, 0x08192b08), uvec2(0x19080808, 0x08192b08), uvec2(0x2b080819, 0x08192b08),
|
| 341 |
+
uvec2(0x08080808, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x2b2b0808, 0x08192b19), uvec2(0x19190819, 0x08192b2b),
|
| 342 |
+
uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08082b2b, 0x082b0808), uvec2(0x19081908, 0x082b0808),
|
| 343 |
+
uvec2(0x192b0819, 0x082b0808), uvec2(0x2b080808, 0x082b0808), uvec2(0x2b08082b, 0x082b0808), uvec2(0x082b2b19, 0x082b0819),
|
| 344 |
+
uvec2(0x19082b08, 0x082b0819), uvec2(0x08080808, 0x082b082b), uvec2(0x0808082b, 0x082b082b), uvec2(0x08080819, 0x082b1908),
|
| 345 |
+
uvec2(0x08081908, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x19080808, 0x082b1908), uvec2(0x1919192b, 0x082b1908),
|
| 346 |
+
uvec2(0x08080808, 0x082b1919), uvec2(0x19080819, 0x082b1919), uvec2(0x192b1908, 0x082b1919), uvec2(0x2b190808, 0x082b192b),
|
| 347 |
+
uvec2(0x08082b08, 0x082b2b08), uvec2(0x082b0808, 0x082b2b08), uvec2(0x2b191908, 0x082b2b08), uvec2(0x19081908, 0x082b2b2b),
|
| 348 |
+
uvec2(0x08080819, 0x19080808), uvec2(0x08081908, 0x19080808), uvec2(0x08190808, 0x19080808), uvec2(0x08192b08, 0x19080808),
|
| 349 |
+
uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808), uvec2(0x19080808, 0x19080808), uvec2(0x19082b08, 0x19080808),
|
| 350 |
+
uvec2(0x1919192b, 0x19080808), uvec2(0x192b0808, 0x19080808), uvec2(0x2b080819, 0x19080808), uvec2(0x2b081908, 0x19080808),
|
| 351 |
+
uvec2(0x2b190808, 0x19080808), uvec2(0x08080808, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x192b0819, 0x19080819),
|
| 352 |
+
uvec2(0x2b080808, 0x19080819), uvec2(0x2b081919, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08190808, 0x1908082b),
|
| 353 |
+
uvec2(0x19082b08, 0x1908082b), uvec2(0x1919192b, 0x1908082b), uvec2(0x192b2b08, 0x1908082b), uvec2(0x08080808, 0x19081908),
|
| 354 |
+
uvec2(0x08082b08, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x2b080808, 0x19081908), uvec2(0x2b192b19, 0x19081908),
|
| 355 |
+
uvec2(0x0819082b, 0x19081919), uvec2(0x082b1908, 0x19081919), uvec2(0x08080808, 0x1908192b), uvec2(0x08080819, 0x19082b08),
|
| 356 |
+
uvec2(0x08081908, 0x19082b08), uvec2(0x08190808, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x19081919, 0x19082b08),
|
| 357 |
+
uvec2(0x08080808, 0x19082b19), uvec2(0x19192b08, 0x19082b19), uvec2(0x192b0819, 0x19082b19), uvec2(0x2b08082b, 0x19082b19),
|
| 358 |
+
uvec2(0x19081919, 0x19082b2b), uvec2(0x2b190808, 0x19082b2b), uvec2(0x08080808, 0x19190808), uvec2(0x08082b08, 0x19190808),
|
| 359 |
+
uvec2(0x08190819, 0x19190808), uvec2(0x08192b19, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x2b080808, 0x19190808),
|
| 360 |
+
uvec2(0x2b082b08, 0x19190808), uvec2(0x08081908, 0x19190819), uvec2(0x1908082b, 0x19190819), uvec2(0x2b2b1908, 0x19190819),
|
| 361 |
+
uvec2(0x2b190819, 0x1919082b), uvec2(0x2b190808, 0x19191908), uvec2(0x2b19082b, 0x19191908), uvec2(0x08082b2b, 0x19191919),
|
| 362 |
+
uvec2(0x08080819, 0x1919192b), uvec2(0x19191908, 0x1919192b), uvec2(0x08080808, 0x19192b08), uvec2(0x08190819, 0x19192b08),
|
| 363 |
+
uvec2(0x08192b19, 0x19192b08), uvec2(0x192b1908, 0x19192b08), uvec2(0x19080808, 0x19192b19), uvec2(0x08082b08, 0x19192b2b),
|
| 364 |
+
uvec2(0x08081908, 0x192b0808), uvec2(0x08190808, 0x192b0808), uvec2(0x19080808, 0x192b0808), uvec2(0x192b2b08, 0x192b0808),
|
| 365 |
+
uvec2(0x08080808, 0x192b0819), uvec2(0x19191919, 0x192b0819), uvec2(0x08192b08, 0x192b082b), uvec2(0x192b0808, 0x192b082b),
|
| 366 |
+
uvec2(0x08080808, 0x192b1908), uvec2(0x08081919, 0x192b1908), uvec2(0x08190808, 0x192b1919), uvec2(0x0819082b, 0x192b1919),
|
| 367 |
+
uvec2(0x2b081908, 0x192b1919), uvec2(0x1908082b, 0x192b2b08), uvec2(0x08080808, 0x2b080808), uvec2(0x0808082b, 0x2b080808),
|
| 368 |
+
uvec2(0x08082b2b, 0x2b080808), uvec2(0x19080819, 0x2b080808), uvec2(0x2b08082b, 0x2b080808), uvec2(0x08081908, 0x2b080819),
|
| 369 |
+
uvec2(0x08192b08, 0x2b080819), uvec2(0x19080808, 0x2b080819), uvec2(0x08190819, 0x2b08082b), uvec2(0x08080819, 0x2b081908),
|
| 370 |
+
uvec2(0x08081908, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x08191919, 0x2b081908), uvec2(0x19080808, 0x2b081908),
|
| 371 |
+
uvec2(0x192b0808, 0x2b081908), uvec2(0x08080808, 0x2b081919), uvec2(0x1908192b, 0x2b081919), uvec2(0x2b191908, 0x2b081919),
|
| 372 |
+
uvec2(0x08082b19, 0x2b08192b), uvec2(0x19080808, 0x2b08192b), uvec2(0x192b0808, 0x2b08192b), uvec2(0x0808082b, 0x2b082b08),
|
| 373 |
+
uvec2(0x08081908, 0x2b082b19), uvec2(0x08190819, 0x2b082b2b), uvec2(0x08081908, 0x2b190808), uvec2(0x08190808, 0x2b190808),
|
| 374 |
+
uvec2(0x082b1908, 0x2b190808), uvec2(0x19080808, 0x2b190808), uvec2(0x2b2b0819, 0x2b190808), uvec2(0x0819192b, 0x2b190819),
|
| 375 |
+
uvec2(0x2b080808, 0x2b190819), uvec2(0x19081919, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x082b082b, 0x2b191908),
|
| 376 |
+
uvec2(0x19081908, 0x2b191908), uvec2(0x19190819, 0x2b191919), uvec2(0x2b080819, 0x2b192b08), uvec2(0x082b0808, 0x2b192b19),
|
| 377 |
+
uvec2(0x0808082b, 0x2b2b0808), uvec2(0x19190808, 0x2b2b0808), uvec2(0x2b081919, 0x2b2b0808), uvec2(0x08082b19, 0x2b2b0819),
|
| 378 |
+
uvec2(0x08080808, 0x2b2b082b), uvec2(0x08192b08, 0x2b2b1908), uvec2(0x19190808, 0x2b2b2b08), uvec2(0x08081908, 0x2b2b2b19)
|
| 379 |
+
};
|
| 380 |
+
|
| 381 |
+
shared uvec2 iq2xxs_grid[256];
|
| 382 |
+
|
| 383 |
+
void init_iq_shmem(uvec3 wgsize) // IQ2_XXS variant: fills the shared iq2xxs_grid table; only wgsize.x is read
|
| 384 |
+
{
|
| 385 |
+
// Copy the constant grid table into shared memory, then synchronize the workgroup.
|
| 386 |
+
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += wgsize.x) { // each invocation starts at its own local index and steps by wgsize.x
|
| 387 |
+
iq2xxs_grid[i] = iq2xxs_grid_const[i]; // element-wise copy: const table -> shared table
|
| 388 |
+
}
|
| 389 |
+
barrier(); // NOTE(review): because of this barrier, presumably every invocation in the workgroup must call this function before any iq2xxs_grid lookup -- confirm at call sites
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
#define QUANT_K QUANT_K_IQ2_XXS
|
| 393 |
+
#define QUANT_R QUANT_R_IQ2_XXS
|
| 394 |
+
#define A_TYPE block_iq2_xxs
|
| 395 |
+
#define A_TYPE_PACKED16 block_iq2_xxs_packed16
|
| 396 |
+
#endif
|
| 397 |
+
|
| 398 |
+
#define QUANT_K_IQ2_XS 256
|
| 399 |
+
#define QUANT_R_IQ2_XS 1
|
| 400 |
+
|
| 401 |
+
struct block_iq2_xs // one IQ2_XS quantization block (QUANT_K_IQ2_XS values per block)
|
| 402 |
+
{
|
| 403 |
+
float16_t d; // half-precision factor; the IQ2_XS load path in mul_mm.comp multiplies every value by float(d)
|
| 404 |
+
uint16_t qs[QUANT_K_IQ2_XS/8]; // per-entry word: mul_mm.comp uses (qs & 511) as the iq2xs_grid index and (qs >> 9) as 7 sign bits
|
| 405 |
+
uint8_t scales[QUANT_K_IQ2_XS/32]; // mul_mm.comp extracts a 4-bit scale from the low or high nibble of each byte
|
| 406 |
+
};
|
| 407 |
+
|
| 408 |
+
struct block_iq2_xs_packed16 // same field order as block_iq2_xs, with scales declared as 16-bit words
|
| 409 |
+
{
|
| 410 |
+
float16_t d; // half-precision factor, as in block_iq2_xs
|
| 411 |
+
uint16_t qs[QUANT_K_IQ2_XS/8]; // identical declaration to block_iq2_xs.qs
|
| 412 |
+
uint16_t scales[QUANT_K_IQ2_XS/64]; // QUANT_K_IQ2_XS/64 uint16 entries, i.e. the same byte count as uint8_t scales[QUANT_K_IQ2_XS/32]
|
| 413 |
+
};
|
| 414 |
+
|
| 415 |
+
#if defined(DATA_A_IQ2_XS)
|
| 416 |
+
|
| 417 |
+
const uvec2 iq2xs_grid_const[512] = {
|
| 418 |
+
uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
|
| 419 |
+
uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x0819192b, 0x08080808),
|
| 420 |
+
uvec2(0x08192b19, 0x08080808), uvec2(0x082b0808, 0x08080808), uvec2(0x082b082b, 0x08080808), uvec2(0x082b1919, 0x08080808),
|
| 421 |
+
uvec2(0x082b2b08, 0x08080808), uvec2(0x19080819, 0x08080808), uvec2(0x19081908, 0x08080808), uvec2(0x1908192b, 0x08080808),
|
| 422 |
+
uvec2(0x19082b19, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x1919082b, 0x08080808), uvec2(0x19191919, 0x08080808),
|
| 423 |
+
uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808), uvec2(0x192b1908, 0x08080808), uvec2(0x2b080808, 0x08080808),
|
| 424 |
+
uvec2(0x2b08082b, 0x08080808), uvec2(0x2b081919, 0x08080808), uvec2(0x2b082b08, 0x08080808), uvec2(0x2b190819, 0x08080808),
|
| 425 |
+
uvec2(0x2b191908, 0x08080808), uvec2(0x2b192b19, 0x08080808), uvec2(0x2b2b0808, 0x08080808), uvec2(0x08080819, 0x08080819),
|
| 426 |
+
uvec2(0x08081908, 0x08080819), uvec2(0x0808192b, 0x08080819), uvec2(0x08082b19, 0x08080819), uvec2(0x08190808, 0x08080819),
|
| 427 |
+
uvec2(0x0819082b, 0x08080819), uvec2(0x08191919, 0x08080819), uvec2(0x08192b08, 0x08080819), uvec2(0x08192b2b, 0x08080819),
|
| 428 |
+
uvec2(0x082b0819, 0x08080819), uvec2(0x082b1908, 0x08080819), uvec2(0x19080808, 0x08080819), uvec2(0x1908082b, 0x08080819),
|
| 429 |
+
uvec2(0x19081919, 0x08080819), uvec2(0x19082b08, 0x08080819), uvec2(0x19190819, 0x08080819), uvec2(0x19191908, 0x08080819),
|
| 430 |
+
uvec2(0x192b0808, 0x08080819), uvec2(0x192b2b08, 0x08080819), uvec2(0x2b080819, 0x08080819), uvec2(0x2b081908, 0x08080819),
|
| 431 |
+
uvec2(0x2b190808, 0x08080819), uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b), uvec2(0x08081919, 0x0808082b),
|
| 432 |
+
uvec2(0x08082b08, 0x0808082b), uvec2(0x08190819, 0x0808082b), uvec2(0x08191908, 0x0808082b), uvec2(0x082b0808, 0x0808082b),
|
| 433 |
+
uvec2(0x19080819, 0x0808082b), uvec2(0x19081908, 0x0808082b), uvec2(0x19190808, 0x0808082b), uvec2(0x19191919, 0x0808082b),
|
| 434 |
+
uvec2(0x2b080808, 0x0808082b), uvec2(0x2b082b2b, 0x0808082b), uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908),
|
| 435 |
+
uvec2(0x0808192b, 0x08081908), uvec2(0x08082b19, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x0819082b, 0x08081908),
|
| 436 |
+
uvec2(0x08191919, 0x08081908), uvec2(0x08192b08, 0x08081908), uvec2(0x082b0819, 0x08081908), uvec2(0x082b1908, 0x08081908),
|
| 437 |
+
uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19081919, 0x08081908), uvec2(0x19082b08, 0x08081908),
|
| 438 |
+
uvec2(0x19190819, 0x08081908), uvec2(0x19191908, 0x08081908), uvec2(0x1919192b, 0x08081908), uvec2(0x192b0808, 0x08081908),
|
| 439 |
+
uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b190808, 0x08081908), uvec2(0x08080808, 0x08081919),
|
| 440 |
+
uvec2(0x0808082b, 0x08081919), uvec2(0x08081919, 0x08081919), uvec2(0x08082b08, 0x08081919), uvec2(0x08190819, 0x08081919),
|
| 441 |
+
uvec2(0x08191908, 0x08081919), uvec2(0x082b0808, 0x08081919), uvec2(0x19080819, 0x08081919), uvec2(0x19081908, 0x08081919),
|
| 442 |
+
uvec2(0x19190808, 0x08081919), uvec2(0x192b0819, 0x08081919), uvec2(0x2b080808, 0x08081919), uvec2(0x08080819, 0x0808192b),
|
| 443 |
+
uvec2(0x08081908, 0x0808192b), uvec2(0x08190808, 0x0808192b), uvec2(0x082b192b, 0x0808192b), uvec2(0x19080808, 0x0808192b),
|
| 444 |
+
uvec2(0x1908082b, 0x0808192b), uvec2(0x2b081908, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x0808082b, 0x08082b08),
|
| 445 |
+
uvec2(0x08081919, 0x08082b08), uvec2(0x08082b08, 0x08082b08), uvec2(0x08082b2b, 0x08082b08), uvec2(0x08190819, 0x08082b08),
|
| 446 |
+
uvec2(0x08191908, 0x08082b08), uvec2(0x082b0808, 0x08082b08), uvec2(0x082b1919, 0x08082b08), uvec2(0x19080819, 0x08082b08),
|
| 447 |
+
uvec2(0x19081908, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x19192b08, 0x08082b08), uvec2(0x2b080808, 0x08082b08),
|
| 448 |
+
uvec2(0x2b2b0808, 0x08082b08), uvec2(0x2b2b2b2b, 0x08082b08), uvec2(0x08080819, 0x08082b19), uvec2(0x08081908, 0x08082b19),
|
| 449 |
+
uvec2(0x08190808, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x2b080819, 0x08082b19), uvec2(0x2b082b19, 0x08082b19),
|
| 450 |
+
uvec2(0x08080808, 0x08082b2b), uvec2(0x082b0808, 0x08082b2b), uvec2(0x082b2b08, 0x08082b2b), uvec2(0x2b19192b, 0x08082b2b),
|
| 451 |
+
uvec2(0x2b2b0808, 0x08082b2b), uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808), uvec2(0x0808192b, 0x08190808),
|
| 452 |
+
uvec2(0x08082b19, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x0819082b, 0x08190808), uvec2(0x08191919, 0x08190808),
|
| 453 |
+
uvec2(0x08192b08, 0x08190808), uvec2(0x082b0819, 0x08190808), uvec2(0x082b1908, 0x08190808), uvec2(0x19080808, 0x08190808),
|
| 454 |
+
uvec2(0x1908082b, 0x08190808), uvec2(0x19081919, 0x08190808), uvec2(0x19082b08, 0x08190808), uvec2(0x19190819, 0x08190808),
|
| 455 |
+
uvec2(0x19191908, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x192b2b2b, 0x08190808), uvec2(0x2b080819, 0x08190808),
|
| 456 |
+
uvec2(0x2b081908, 0x08190808), uvec2(0x2b190808, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x0808082b, 0x08190819),
|
| 457 |
+
uvec2(0x08081919, 0x08190819), uvec2(0x08082b08, 0x08190819), uvec2(0x08190819, 0x08190819), uvec2(0x08191908, 0x08190819),
|
| 458 |
+
uvec2(0x082b0808, 0x08190819), uvec2(0x19080819, 0x08190819), uvec2(0x19081908, 0x08190819), uvec2(0x19190808, 0x08190819),
|
| 459 |
+
uvec2(0x2b080808, 0x08190819), uvec2(0x2b191908, 0x08190819), uvec2(0x2b19192b, 0x08190819), uvec2(0x08080819, 0x0819082b),
|
| 460 |
+
uvec2(0x08081908, 0x0819082b), uvec2(0x0808192b, 0x0819082b), uvec2(0x08190808, 0x0819082b), uvec2(0x19080808, 0x0819082b),
|
| 461 |
+
uvec2(0x192b0808, 0x0819082b), uvec2(0x08080808, 0x08191908), uvec2(0x0808082b, 0x08191908), uvec2(0x08081919, 0x08191908),
|
| 462 |
+
uvec2(0x08082b08, 0x08191908), uvec2(0x08190819, 0x08191908), uvec2(0x08191908, 0x08191908), uvec2(0x082b0808, 0x08191908),
|
| 463 |
+
uvec2(0x19080819, 0x08191908), uvec2(0x19081908, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x19190808, 0x08191908),
|
| 464 |
+
uvec2(0x192b1908, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x08080819, 0x08191919), uvec2(0x08081908, 0x08191919),
|
| 465 |
+
uvec2(0x08190808, 0x08191919), uvec2(0x19080808, 0x08191919), uvec2(0x08080808, 0x0819192b), uvec2(0x08191908, 0x0819192b),
|
| 466 |
+
uvec2(0x19082b19, 0x0819192b), uvec2(0x08080819, 0x08192b08), uvec2(0x08081908, 0x08192b08), uvec2(0x08190808, 0x08192b08),
|
| 467 |
+
uvec2(0x0819082b, 0x08192b08), uvec2(0x19080808, 0x08192b08), uvec2(0x19191908, 0x08192b08), uvec2(0x2b08192b, 0x08192b08),
|
| 468 |
+
uvec2(0x08080808, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x192b192b, 0x08192b19), uvec2(0x19190819, 0x08192b2b),
|
| 469 |
+
uvec2(0x2b2b2b19, 0x08192b2b), uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08081919, 0x082b0808),
|
| 470 |
+
uvec2(0x08082b08, 0x082b0808), uvec2(0x08082b2b, 0x082b0808), uvec2(0x08190819, 0x082b0808), uvec2(0x08191908, 0x082b0808),
|
| 471 |
+
uvec2(0x082b0808, 0x082b0808), uvec2(0x19080819, 0x082b0808), uvec2(0x19081908, 0x082b0808), uvec2(0x19190808, 0x082b0808),
|
| 472 |
+
uvec2(0x2b080808, 0x082b0808), uvec2(0x2b2b0808, 0x082b0808), uvec2(0x08080819, 0x082b0819), uvec2(0x08081908, 0x082b0819),
|
| 473 |
+
uvec2(0x08190808, 0x082b0819), uvec2(0x19080808, 0x082b0819), uvec2(0x19082b08, 0x082b0819), uvec2(0x192b1919, 0x082b0819),
|
| 474 |
+
uvec2(0x08080808, 0x082b082b), uvec2(0x082b082b, 0x082b082b), uvec2(0x2b080808, 0x082b082b), uvec2(0x2b2b2b08, 0x082b082b),
|
| 475 |
+
uvec2(0x08080819, 0x082b1908), uvec2(0x08081908, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x082b2b19, 0x082b1908),
|
| 476 |
+
uvec2(0x19080808, 0x082b1908), uvec2(0x08080808, 0x082b1919), uvec2(0x19080819, 0x082b1919), uvec2(0x1919082b, 0x082b1919),
|
| 477 |
+
uvec2(0x2b192b19, 0x082b1919), uvec2(0x08080819, 0x082b192b), uvec2(0x08192b2b, 0x082b192b), uvec2(0x2b2b192b, 0x082b192b),
|
| 478 |
+
uvec2(0x08080808, 0x082b2b08), uvec2(0x08082b08, 0x082b2b08), uvec2(0x08082b2b, 0x082b2b08), uvec2(0x082b0808, 0x082b2b08),
|
| 479 |
+
uvec2(0x19191919, 0x082b2b08), uvec2(0x2b082b08, 0x082b2b08), uvec2(0x2b2b082b, 0x082b2b08), uvec2(0x192b2b08, 0x082b2b19),
|
| 480 |
+
uvec2(0x2b190808, 0x082b2b19), uvec2(0x08082b08, 0x082b2b2b), uvec2(0x082b0808, 0x082b2b2b), uvec2(0x2b08082b, 0x082b2b2b),
|
| 481 |
+
uvec2(0x2b082b08, 0x082b2b2b), uvec2(0x2b082b2b, 0x082b2b2b), uvec2(0x08080819, 0x19080808), uvec2(0x08081908, 0x19080808),
|
| 482 |
+
uvec2(0x0808192b, 0x19080808), uvec2(0x08082b19, 0x19080808), uvec2(0x08190808, 0x19080808), uvec2(0x0819082b, 0x19080808),
|
| 483 |
+
uvec2(0x08191919, 0x19080808), uvec2(0x08192b08, 0x19080808), uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808),
|
| 484 |
+
uvec2(0x19080808, 0x19080808), uvec2(0x1908082b, 0x19080808), uvec2(0x19081919, 0x19080808), uvec2(0x19082b08, 0x19080808),
|
| 485 |
+
uvec2(0x19082b2b, 0x19080808), uvec2(0x19190819, 0x19080808), uvec2(0x19191908, 0x19080808), uvec2(0x192b0808, 0x19080808),
|
| 486 |
+
uvec2(0x192b1919, 0x19080808), uvec2(0x2b080819, 0x19080808), uvec2(0x2b081908, 0x19080808), uvec2(0x2b190808, 0x19080808),
|
| 487 |
+
uvec2(0x08080808, 0x19080819), uvec2(0x0808082b, 0x19080819), uvec2(0x08081919, 0x19080819), uvec2(0x08082b08, 0x19080819),
|
| 488 |
+
uvec2(0x08190819, 0x19080819), uvec2(0x08191908, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x19080819, 0x19080819),
|
| 489 |
+
uvec2(0x19081908, 0x19080819), uvec2(0x19190808, 0x19080819), uvec2(0x2b080808, 0x19080819), uvec2(0x2b081919, 0x19080819),
|
| 490 |
+
uvec2(0x2b2b082b, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08081908, 0x1908082b), uvec2(0x08190808, 0x1908082b),
|
| 491 |
+
uvec2(0x0819082b, 0x1908082b), uvec2(0x082b2b19, 0x1908082b), uvec2(0x19080808, 0x1908082b), uvec2(0x08080808, 0x19081908),
|
| 492 |
+
uvec2(0x0808082b, 0x19081908), uvec2(0x08081919, 0x19081908), uvec2(0x08082b08, 0x19081908), uvec2(0x08190819, 0x19081908),
|
| 493 |
+
uvec2(0x08191908, 0x19081908), uvec2(0x08192b19, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x19080819, 0x19081908),
|
| 494 |
+
uvec2(0x19081908, 0x19081908), uvec2(0x19190808, 0x19081908), uvec2(0x2b080808, 0x19081908), uvec2(0x2b191908, 0x19081908),
|
| 495 |
+
uvec2(0x08080819, 0x19081919), uvec2(0x08081908, 0x19081919), uvec2(0x08190808, 0x19081919), uvec2(0x082b1908, 0x19081919),
|
| 496 |
+
uvec2(0x19080808, 0x19081919), uvec2(0x2b192b2b, 0x19081919), uvec2(0x08080808, 0x1908192b), uvec2(0x08082b2b, 0x1908192b),
|
| 497 |
+
uvec2(0x19081908, 0x1908192b), uvec2(0x19190808, 0x1908192b), uvec2(0x08080819, 0x19082b08), uvec2(0x08081908, 0x19082b08),
|
| 498 |
+
uvec2(0x08190808, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x19081919, 0x19082b08), uvec2(0x19191908, 0x19082b08),
|
| 499 |
+
uvec2(0x192b082b, 0x19082b08), uvec2(0x08080808, 0x19082b19), uvec2(0x08190819, 0x19082b19), uvec2(0x19081908, 0x19082b19),
|
| 500 |
+
uvec2(0x19190808, 0x19082b19), uvec2(0x192b2b19, 0x19082b19), uvec2(0x08081908, 0x19082b2b), uvec2(0x08080808, 0x19190808),
|
| 501 |
+
uvec2(0x0808082b, 0x19190808), uvec2(0x08081919, 0x19190808), uvec2(0x08082b08, 0x19190808), uvec2(0x08190819, 0x19190808),
|
| 502 |
+
uvec2(0x08191908, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x082b2b08, 0x19190808), uvec2(0x19080819, 0x19190808),
|
| 503 |
+
uvec2(0x19081908, 0x19190808), uvec2(0x19190808, 0x19190808), uvec2(0x2b080808, 0x19190808), uvec2(0x08080819, 0x19190819),
|
| 504 |
+
uvec2(0x08081908, 0x19190819), uvec2(0x08190808, 0x19190819), uvec2(0x08191919, 0x19190819), uvec2(0x19080808, 0x19190819),
|
| 505 |
+
uvec2(0x1908082b, 0x19190819), uvec2(0x08080808, 0x1919082b), uvec2(0x19081908, 0x1919082b), uvec2(0x2b2b2b2b, 0x1919082b),
|
| 506 |
+
uvec2(0x08080819, 0x19191908), uvec2(0x08081908, 0x19191908), uvec2(0x08190808, 0x19191908), uvec2(0x082b0819, 0x19191908),
|
| 507 |
+
uvec2(0x19080808, 0x19191908), uvec2(0x192b0808, 0x19191908), uvec2(0x2b080819, 0x19191908), uvec2(0x2b2b0819, 0x19191908),
|
| 508 |
+
uvec2(0x08080808, 0x19191919), uvec2(0x08082b08, 0x19191919), uvec2(0x2b080808, 0x19191919), uvec2(0x2b082b08, 0x19191919),
|
| 509 |
+
uvec2(0x082b0819, 0x1919192b), uvec2(0x192b2b08, 0x1919192b), uvec2(0x2b2b0819, 0x1919192b), uvec2(0x08080808, 0x19192b08),
|
| 510 |
+
uvec2(0x08191908, 0x19192b08), uvec2(0x19080819, 0x19192b08), uvec2(0x19190808, 0x19192b08), uvec2(0x2b192b19, 0x19192b08),
|
| 511 |
+
uvec2(0x08192b2b, 0x19192b19), uvec2(0x19080808, 0x19192b19), uvec2(0x1908082b, 0x19192b19), uvec2(0x2b081919, 0x19192b2b),
|
| 512 |
+
uvec2(0x08080819, 0x192b0808), uvec2(0x08081908, 0x192b0808), uvec2(0x08190808, 0x192b0808), uvec2(0x19080808, 0x192b0808),
|
| 513 |
+
uvec2(0x19191908, 0x192b0808), uvec2(0x192b082b, 0x192b0808), uvec2(0x2b08192b, 0x192b0808), uvec2(0x2b2b2b19, 0x192b0808),
|
| 514 |
+
uvec2(0x08080808, 0x192b0819), uvec2(0x082b1908, 0x192b082b), uvec2(0x19082b2b, 0x192b082b), uvec2(0x2b19082b, 0x192b082b),
|
| 515 |
+
uvec2(0x08080808, 0x192b1908), uvec2(0x0819192b, 0x192b1908), uvec2(0x08190808, 0x192b1919), uvec2(0x19080808, 0x192b1919),
|
| 516 |
+
uvec2(0x19081919, 0x192b1919), uvec2(0x2b2b1908, 0x192b1919), uvec2(0x08080819, 0x192b2b08), uvec2(0x192b2b2b, 0x192b2b08),
|
| 517 |
+
uvec2(0x082b1919, 0x192b2b19), uvec2(0x0808192b, 0x192b2b2b), uvec2(0x19191908, 0x192b2b2b), uvec2(0x192b082b, 0x192b2b2b),
|
| 518 |
+
uvec2(0x08080808, 0x2b080808), uvec2(0x0808082b, 0x2b080808), uvec2(0x08081919, 0x2b080808), uvec2(0x08082b08, 0x2b080808),
|
| 519 |
+
uvec2(0x08190819, 0x2b080808), uvec2(0x08191908, 0x2b080808), uvec2(0x082b0808, 0x2b080808), uvec2(0x082b2b2b, 0x2b080808),
|
| 520 |
+
uvec2(0x19080819, 0x2b080808), uvec2(0x19081908, 0x2b080808), uvec2(0x19190808, 0x2b080808), uvec2(0x2b080808, 0x2b080808),
|
| 521 |
+
uvec2(0x2b08082b, 0x2b080808), uvec2(0x2b2b2b08, 0x2b080808), uvec2(0x2b2b2b2b, 0x2b080808), uvec2(0x08080819, 0x2b080819),
|
| 522 |
+
uvec2(0x08081908, 0x2b080819), uvec2(0x0808192b, 0x2b080819), uvec2(0x08190808, 0x2b080819), uvec2(0x19080808, 0x2b080819),
|
| 523 |
+
uvec2(0x19190819, 0x2b080819), uvec2(0x19192b19, 0x2b080819), uvec2(0x08080808, 0x2b08082b), uvec2(0x082b0808, 0x2b08082b),
|
| 524 |
+
uvec2(0x2b080808, 0x2b08082b), uvec2(0x2b08082b, 0x2b08082b), uvec2(0x2b2b0808, 0x2b08082b), uvec2(0x2b2b2b08, 0x2b08082b),
|
| 525 |
+
uvec2(0x08080819, 0x2b081908), uvec2(0x08081908, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x0819082b, 0x2b081908),
|
| 526 |
+
uvec2(0x08191919, 0x2b081908), uvec2(0x19080808, 0x2b081908), uvec2(0x192b0808, 0x2b081908), uvec2(0x2b082b19, 0x2b081908),
|
| 527 |
+
uvec2(0x08080808, 0x2b081919), uvec2(0x19081908, 0x2b081919), uvec2(0x2b2b1919, 0x2b081919), uvec2(0x08192b08, 0x2b08192b),
|
| 528 |
+
uvec2(0x192b2b2b, 0x2b08192b), uvec2(0x08080808, 0x2b082b08), uvec2(0x08082b08, 0x2b082b08), uvec2(0x082b1919, 0x2b082b08),
|
| 529 |
+
uvec2(0x19192b2b, 0x2b082b08), uvec2(0x2b080808, 0x2b082b08), uvec2(0x2b08082b, 0x2b082b08), uvec2(0x2b2b2b08, 0x2b082b08),
|
| 530 |
+
uvec2(0x0808192b, 0x2b082b19), uvec2(0x082b082b, 0x2b082b2b), uvec2(0x2b080808, 0x2b082b2b), uvec2(0x2b082b08, 0x2b082b2b),
|
| 531 |
+
uvec2(0x2b19192b, 0x2b082b2b), uvec2(0x2b2b2b08, 0x2b082b2b), uvec2(0x08080819, 0x2b190808), uvec2(0x08081908, 0x2b190808),
|
| 532 |
+
uvec2(0x08190808, 0x2b190808), uvec2(0x19080808, 0x2b190808), uvec2(0x1919192b, 0x2b190808), uvec2(0x2b081908, 0x2b190808),
|
| 533 |
+
uvec2(0x08080808, 0x2b190819), uvec2(0x082b082b, 0x2b190819), uvec2(0x192b1908, 0x2b190819), uvec2(0x1919192b, 0x2b19082b),
|
| 534 |
+
uvec2(0x2b082b19, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x08081919, 0x2b191908), uvec2(0x19081908, 0x2b191908),
|
| 535 |
+
uvec2(0x19190808, 0x2b191908), uvec2(0x19192b08, 0x2b191908), uvec2(0x082b2b19, 0x2b191919), uvec2(0x2b190808, 0x2b191919),
|
| 536 |
+
uvec2(0x2b19082b, 0x2b191919), uvec2(0x19080819, 0x2b19192b), uvec2(0x19190819, 0x2b192b08), uvec2(0x2b2b192b, 0x2b192b08),
|
| 537 |
+
uvec2(0x19082b19, 0x2b192b19), uvec2(0x08191919, 0x2b192b2b), uvec2(0x192b0808, 0x2b192b2b), uvec2(0x08080808, 0x2b2b0808),
|
| 538 |
+
uvec2(0x0808082b, 0x2b2b0808), uvec2(0x08082b08, 0x2b2b0808), uvec2(0x08082b2b, 0x2b2b0808), uvec2(0x082b0808, 0x2b2b0808),
|
| 539 |
+
uvec2(0x082b2b2b, 0x2b2b0808), uvec2(0x2b2b0808, 0x2b2b0808), uvec2(0x19190819, 0x2b2b0819), uvec2(0x19192b19, 0x2b2b0819),
|
| 540 |
+
uvec2(0x2b2b192b, 0x2b2b0819), uvec2(0x08080808, 0x2b2b082b), uvec2(0x0808082b, 0x2b2b082b), uvec2(0x08082b08, 0x2b2b082b),
|
| 541 |
+
uvec2(0x082b2b2b, 0x2b2b082b), uvec2(0x2b080808, 0x2b2b082b), uvec2(0x2b2b0808, 0x2b2b082b), uvec2(0x19080808, 0x2b2b1908),
|
| 542 |
+
uvec2(0x2b191919, 0x2b2b1908), uvec2(0x192b1919, 0x2b2b192b), uvec2(0x2b192b08, 0x2b2b192b), uvec2(0x08082b2b, 0x2b2b2b08),
|
| 543 |
+
uvec2(0x082b0808, 0x2b2b2b08), uvec2(0x082b082b, 0x2b2b2b08), uvec2(0x082b2b08, 0x2b2b2b08), uvec2(0x2b2b0808, 0x2b2b2b08),
|
| 544 |
+
uvec2(0x2b2b2b08, 0x2b2b2b08), uvec2(0x08081908, 0x2b2b2b19), uvec2(0x2b081908, 0x2b2b2b19), uvec2(0x2b08192b, 0x2b2b2b19),
|
| 545 |
+
uvec2(0x082b2b08, 0x2b2b2b2b), uvec2(0x082b2b2b, 0x2b2b2b2b), uvec2(0x2b190819, 0x2b2b2b2b), uvec2(0x2b2b2b2b, 0x2b2b2b2b),
|
| 546 |
+
};
|
| 547 |
+
|
| 548 |
+
// Workgroup-shared copy of the IQ2_XS grid table: 512 uvec2 entries.
// It is filled from iq2xs_grid_const by init_iq_shmem() below.
shared uvec2 iq2xs_grid[512];
|
| 549 |
+
|
| 550 |
+
// Fills the shared array iq2xs_grid from the constant table iq2xs_grid_const,
// then synchronizes with barrier().
//
// wgsize: only wgsize.x is read; it is the stride each invocation uses while
//         walking the table.
// NOTE(review): presumably wgsize is the workgroup size of the calling shader and
// every invocation of the workgroup calls this before reading iq2xs_grid — confirm
// at the call sites.
void init_iq_shmem(uvec3 wgsize)
|
| 551 |
+
{
|
| 552 |
+
// copy the table into shared memory and sync
// Each invocation starts at its own gl_LocalInvocationIndex and advances by
// wgsize.x until the end of the table, copying one entry per iteration.
|
| 553 |
+
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += wgsize.x) {
|
| 554 |
+
iq2xs_grid[i] = iq2xs_grid_const[i];
|
| 555 |
+
}
|
| 556 |
+
// Reached by every invocation once its share of the copy is done.
barrier();
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
// Type-independent aliases for the IQ2_XS case: QUANT_K, QUANT_R, A_TYPE and
// A_TYPE_PACKED16 are mapped onto the IQ2_XS-specific constants and block structs.
// They are only defined inside the DATA_A_IQ2_XS conditional section.
#define QUANT_K QUANT_K_IQ2_XS
|
| 560 |
+
#define QUANT_R QUANT_R_IQ2_XS
|
| 561 |
+
#define A_TYPE block_iq2_xs
|
| 562 |
+
#define A_TYPE_PACKED16 block_iq2_xs_packed16
|
| 563 |
+
#endif
|
| 564 |
+
|
| 565 |
+
// Constants for the IQ2_S format. QUANT_K_IQ2_S sizes the arrays of block_iq2_s below.
// NOTE(review): presumably the number of quantized values stored per block — confirm.
#define QUANT_K_IQ2_S 256
|
| 566 |
+
// NOTE(review): meaning of QUANT_R is not visible here — confirm against the
// shaders that read it.
#define QUANT_R_IQ2_S 1
|
| 567 |
+
|
| 568 |
+
// Declaration of one IQ2_S block. With QUANT_K_IQ2_S = 256 the members are
// d, qs[64], qh[8] and scales[8]; all three arrays are byte arrays.
struct block_iq2_s
|
| 569 |
+
{
|
| 570 |
+
// NOTE(review): presumably the per-block scale factor — confirm in the dequant code.
float16_t d;
|
| 571 |
+
// 256/4 = 64 bytes.
// NOTE(review): presumably low bits of the grid indices plus sign data — TODO confirm.
uint8_t qs[QUANT_K_IQ2_S/4];
|
| 572 |
+
// 256/32 = 8 bytes.
// NOTE(review): presumably the high bits of the grid indices — TODO confirm.
uint8_t qh[QUANT_K_IQ2_S/32];
|
| 573 |
+
// 256/32 = 8 bytes.
// NOTE(review): presumably sub-block scales — TODO confirm.
uint8_t scales[QUANT_K_IQ2_S/32];
|
| 574 |
+
};
|
| 575 |
+
|
| 576 |
+
#if defined(DATA_A_IQ2_S)
|
| 577 |
+
|
| 578 |
+
const uvec2 iq2s_grid_const[1024] = {
|
| 579 |
+
uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
|
| 580 |
+
uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x0819192b, 0x08080808),
|
| 581 |
+
uvec2(0x08192b19, 0x08080808), uvec2(0x082b0808, 0x08080808), uvec2(0x082b082b, 0x08080808), uvec2(0x082b1919, 0x08080808),
|
| 582 |
+
uvec2(0x082b2b08, 0x08080808), uvec2(0x19080819, 0x08080808), uvec2(0x19081908, 0x08080808), uvec2(0x1908192b, 0x08080808),
|
| 583 |
+
uvec2(0x19082b19, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x1919082b, 0x08080808), uvec2(0x19191919, 0x08080808),
|
| 584 |
+
uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808), uvec2(0x192b1908, 0x08080808), uvec2(0x192b192b, 0x08080808),
|
| 585 |
+
uvec2(0x192b2b19, 0x08080808), uvec2(0x2b080808, 0x08080808), uvec2(0x2b08082b, 0x08080808), uvec2(0x2b081919, 0x08080808),
|
| 586 |
+
uvec2(0x2b082b08, 0x08080808), uvec2(0x2b190819, 0x08080808), uvec2(0x2b191908, 0x08080808), uvec2(0x2b2b0808, 0x08080808),
|
| 587 |
+
uvec2(0x2b2b1919, 0x08080808), uvec2(0x2b2b2b2b, 0x08080808), uvec2(0x08080819, 0x08080819), uvec2(0x08081908, 0x08080819),
|
| 588 |
+
uvec2(0x0808192b, 0x08080819), uvec2(0x08082b19, 0x08080819), uvec2(0x08190808, 0x08080819), uvec2(0x0819082b, 0x08080819),
|
| 589 |
+
uvec2(0x08191919, 0x08080819), uvec2(0x08192b08, 0x08080819), uvec2(0x082b0819, 0x08080819), uvec2(0x082b1908, 0x08080819),
|
| 590 |
+
uvec2(0x19080808, 0x08080819), uvec2(0x1908082b, 0x08080819), uvec2(0x19081919, 0x08080819), uvec2(0x19082b08, 0x08080819),
|
| 591 |
+
uvec2(0x19190819, 0x08080819), uvec2(0x19191908, 0x08080819), uvec2(0x1919192b, 0x08080819), uvec2(0x19192b19, 0x08080819),
|
| 592 |
+
uvec2(0x192b0808, 0x08080819), uvec2(0x192b1919, 0x08080819), uvec2(0x192b2b08, 0x08080819), uvec2(0x2b080819, 0x08080819),
|
| 593 |
+
uvec2(0x2b081908, 0x08080819), uvec2(0x2b190808, 0x08080819), uvec2(0x2b19082b, 0x08080819), uvec2(0x2b191919, 0x08080819),
|
| 594 |
+
uvec2(0x2b2b0819, 0x08080819), uvec2(0x2b2b1908, 0x08080819), uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b),
|
| 595 |
+
uvec2(0x08081919, 0x0808082b), uvec2(0x08082b08, 0x0808082b), uvec2(0x08190819, 0x0808082b), uvec2(0x08191908, 0x0808082b),
|
| 596 |
+
uvec2(0x082b0808, 0x0808082b), uvec2(0x082b2b2b, 0x0808082b), uvec2(0x19080819, 0x0808082b), uvec2(0x19081908, 0x0808082b),
|
| 597 |
+
uvec2(0x1908192b, 0x0808082b), uvec2(0x19082b19, 0x0808082b), uvec2(0x19190808, 0x0808082b), uvec2(0x19191919, 0x0808082b),
|
| 598 |
+
uvec2(0x2b080808, 0x0808082b), uvec2(0x2b081919, 0x0808082b), uvec2(0x2b082b2b, 0x0808082b), uvec2(0x2b191908, 0x0808082b),
|
| 599 |
+
uvec2(0x2b2b082b, 0x0808082b), uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908), uvec2(0x0808192b, 0x08081908),
|
| 600 |
+
uvec2(0x08082b19, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x0819082b, 0x08081908), uvec2(0x08191919, 0x08081908),
|
| 601 |
+
uvec2(0x08192b08, 0x08081908), uvec2(0x082b0819, 0x08081908), uvec2(0x082b1908, 0x08081908), uvec2(0x082b192b, 0x08081908),
|
| 602 |
+
uvec2(0x082b2b19, 0x08081908), uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19081919, 0x08081908),
|
| 603 |
+
uvec2(0x19082b08, 0x08081908), uvec2(0x19082b2b, 0x08081908), uvec2(0x19190819, 0x08081908), uvec2(0x19191908, 0x08081908),
|
| 604 |
+
uvec2(0x1919192b, 0x08081908), uvec2(0x19192b19, 0x08081908), uvec2(0x192b0808, 0x08081908), uvec2(0x192b082b, 0x08081908),
|
| 605 |
+
uvec2(0x192b1919, 0x08081908), uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b08192b, 0x08081908),
|
| 606 |
+
uvec2(0x2b082b19, 0x08081908), uvec2(0x2b190808, 0x08081908), uvec2(0x2b191919, 0x08081908), uvec2(0x2b192b08, 0x08081908),
|
| 607 |
+
uvec2(0x2b2b0819, 0x08081908), uvec2(0x2b2b1908, 0x08081908), uvec2(0x08080808, 0x08081919), uvec2(0x0808082b, 0x08081919),
|
| 608 |
+
uvec2(0x08081919, 0x08081919), uvec2(0x08082b08, 0x08081919), uvec2(0x08082b2b, 0x08081919), uvec2(0x08190819, 0x08081919),
|
| 609 |
+
uvec2(0x08191908, 0x08081919), uvec2(0x0819192b, 0x08081919), uvec2(0x08192b19, 0x08081919), uvec2(0x082b0808, 0x08081919),
|
| 610 |
+
uvec2(0x082b1919, 0x08081919), uvec2(0x082b2b08, 0x08081919), uvec2(0x19080819, 0x08081919), uvec2(0x19081908, 0x08081919),
|
| 611 |
+
uvec2(0x1908192b, 0x08081919), uvec2(0x19082b19, 0x08081919), uvec2(0x19190808, 0x08081919), uvec2(0x1919082b, 0x08081919),
|
| 612 |
+
uvec2(0x19191919, 0x08081919), uvec2(0x19192b08, 0x08081919), uvec2(0x192b0819, 0x08081919), uvec2(0x192b1908, 0x08081919),
|
| 613 |
+
uvec2(0x2b080808, 0x08081919), uvec2(0x2b08082b, 0x08081919), uvec2(0x2b081919, 0x08081919), uvec2(0x2b082b08, 0x08081919),
|
| 614 |
+
uvec2(0x2b190819, 0x08081919), uvec2(0x2b191908, 0x08081919), uvec2(0x2b2b0808, 0x08081919), uvec2(0x08080819, 0x0808192b),
|
| 615 |
+
uvec2(0x08081908, 0x0808192b), uvec2(0x0808192b, 0x0808192b), uvec2(0x08082b19, 0x0808192b), uvec2(0x08190808, 0x0808192b),
|
| 616 |
+
uvec2(0x08191919, 0x0808192b), uvec2(0x19080808, 0x0808192b), uvec2(0x19081919, 0x0808192b), uvec2(0x19082b08, 0x0808192b),
|
| 617 |
+
uvec2(0x19190819, 0x0808192b), uvec2(0x19191908, 0x0808192b), uvec2(0x192b0808, 0x0808192b), uvec2(0x2b080819, 0x0808192b),
|
| 618 |
+
uvec2(0x2b081908, 0x0808192b), uvec2(0x2b190808, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x0808082b, 0x08082b08),
|
| 619 |
+
uvec2(0x08081919, 0x08082b08), uvec2(0x08082b08, 0x08082b08), uvec2(0x08190819, 0x08082b08), uvec2(0x08191908, 0x08082b08),
|
| 620 |
+
uvec2(0x0819192b, 0x08082b08), uvec2(0x08192b19, 0x08082b08), uvec2(0x082b0808, 0x08082b08), uvec2(0x082b1919, 0x08082b08),
|
| 621 |
+
uvec2(0x082b2b2b, 0x08082b08), uvec2(0x19080819, 0x08082b08), uvec2(0x19081908, 0x08082b08), uvec2(0x1908192b, 0x08082b08),
|
| 622 |
+
uvec2(0x19082b19, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x1919082b, 0x08082b08), uvec2(0x19191919, 0x08082b08),
|
| 623 |
+
uvec2(0x19192b08, 0x08082b08), uvec2(0x192b0819, 0x08082b08), uvec2(0x192b1908, 0x08082b08), uvec2(0x2b080808, 0x08082b08),
|
| 624 |
+
uvec2(0x2b081919, 0x08082b08), uvec2(0x2b191908, 0x08082b08), uvec2(0x2b2b2b2b, 0x08082b08), uvec2(0x08080819, 0x08082b19),
|
| 625 |
+
uvec2(0x08081908, 0x08082b19), uvec2(0x08190808, 0x08082b19), uvec2(0x0819082b, 0x08082b19), uvec2(0x08191919, 0x08082b19),
|
| 626 |
+
uvec2(0x08192b08, 0x08082b19), uvec2(0x082b0819, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x19081919, 0x08082b19),
|
| 627 |
+
uvec2(0x19082b08, 0x08082b19), uvec2(0x19190819, 0x08082b19), uvec2(0x19191908, 0x08082b19), uvec2(0x192b0808, 0x08082b19),
|
| 628 |
+
uvec2(0x2b080819, 0x08082b19), uvec2(0x2b190808, 0x08082b19), uvec2(0x08080808, 0x08082b2b), uvec2(0x08190819, 0x08082b2b),
|
| 629 |
+
uvec2(0x08191908, 0x08082b2b), uvec2(0x082b082b, 0x08082b2b), uvec2(0x082b2b08, 0x08082b2b), uvec2(0x082b2b2b, 0x08082b2b),
|
| 630 |
+
uvec2(0x19190808, 0x08082b2b), uvec2(0x2b192b19, 0x08082b2b), uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808),
|
| 631 |
+
uvec2(0x0808192b, 0x08190808), uvec2(0x08082b19, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x0819082b, 0x08190808),
|
| 632 |
+
uvec2(0x08191919, 0x08190808), uvec2(0x08192b08, 0x08190808), uvec2(0x082b0819, 0x08190808), uvec2(0x082b1908, 0x08190808),
|
| 633 |
+
uvec2(0x082b192b, 0x08190808), uvec2(0x19080808, 0x08190808), uvec2(0x1908082b, 0x08190808), uvec2(0x19081919, 0x08190808),
|
| 634 |
+
uvec2(0x19082b08, 0x08190808), uvec2(0x19190819, 0x08190808), uvec2(0x19191908, 0x08190808), uvec2(0x1919192b, 0x08190808),
|
| 635 |
+
uvec2(0x19192b19, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x192b082b, 0x08190808), uvec2(0x192b1919, 0x08190808),
|
| 636 |
+
uvec2(0x192b2b08, 0x08190808), uvec2(0x2b080819, 0x08190808), uvec2(0x2b081908, 0x08190808), uvec2(0x2b08192b, 0x08190808),
|
| 637 |
+
uvec2(0x2b190808, 0x08190808), uvec2(0x2b191919, 0x08190808), uvec2(0x2b192b08, 0x08190808), uvec2(0x2b2b0819, 0x08190808),
|
| 638 |
+
uvec2(0x2b2b1908, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x0808082b, 0x08190819), uvec2(0x08081919, 0x08190819),
|
| 639 |
+
uvec2(0x08082b08, 0x08190819), uvec2(0x08082b2b, 0x08190819), uvec2(0x08190819, 0x08190819), uvec2(0x08191908, 0x08190819),
|
| 640 |
+
uvec2(0x0819192b, 0x08190819), uvec2(0x08192b19, 0x08190819), uvec2(0x082b0808, 0x08190819), uvec2(0x082b082b, 0x08190819),
|
| 641 |
+
uvec2(0x082b1919, 0x08190819), uvec2(0x082b2b08, 0x08190819), uvec2(0x19080819, 0x08190819), uvec2(0x19081908, 0x08190819),
|
| 642 |
+
uvec2(0x1908192b, 0x08190819), uvec2(0x19082b19, 0x08190819), uvec2(0x19190808, 0x08190819), uvec2(0x1919082b, 0x08190819),
|
| 643 |
+
uvec2(0x19191919, 0x08190819), uvec2(0x19192b08, 0x08190819), uvec2(0x192b0819, 0x08190819), uvec2(0x192b1908, 0x08190819),
|
| 644 |
+
uvec2(0x2b080808, 0x08190819), uvec2(0x2b08082b, 0x08190819), uvec2(0x2b081919, 0x08190819), uvec2(0x2b082b08, 0x08190819),
|
| 645 |
+
uvec2(0x2b190819, 0x08190819), uvec2(0x2b191908, 0x08190819), uvec2(0x08080819, 0x0819082b), uvec2(0x08081908, 0x0819082b),
|
| 646 |
+
uvec2(0x08082b19, 0x0819082b), uvec2(0x08190808, 0x0819082b), uvec2(0x08191919, 0x0819082b), uvec2(0x082b0819, 0x0819082b),
|
| 647 |
+
uvec2(0x082b1908, 0x0819082b), uvec2(0x19080808, 0x0819082b), uvec2(0x19081919, 0x0819082b), uvec2(0x19190819, 0x0819082b),
|
| 648 |
+
uvec2(0x19191908, 0x0819082b), uvec2(0x2b080819, 0x0819082b), uvec2(0x2b081908, 0x0819082b), uvec2(0x2b190808, 0x0819082b),
|
| 649 |
+
uvec2(0x08080808, 0x08191908), uvec2(0x0808082b, 0x08191908), uvec2(0x08081919, 0x08191908), uvec2(0x08082b08, 0x08191908),
|
| 650 |
+
uvec2(0x08190819, 0x08191908), uvec2(0x08191908, 0x08191908), uvec2(0x0819192b, 0x08191908), uvec2(0x08192b19, 0x08191908),
|
| 651 |
+
uvec2(0x082b0808, 0x08191908), uvec2(0x082b1919, 0x08191908), uvec2(0x082b2b08, 0x08191908), uvec2(0x19080819, 0x08191908),
|
| 652 |
+
uvec2(0x19081908, 0x08191908), uvec2(0x1908192b, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x19190808, 0x08191908),
|
| 653 |
+
uvec2(0x1919082b, 0x08191908), uvec2(0x19191919, 0x08191908), uvec2(0x19192b08, 0x08191908), uvec2(0x192b0819, 0x08191908),
|
| 654 |
+
uvec2(0x192b1908, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x2b08082b, 0x08191908), uvec2(0x2b081919, 0x08191908),
|
| 655 |
+
uvec2(0x2b082b08, 0x08191908), uvec2(0x2b190819, 0x08191908), uvec2(0x2b191908, 0x08191908), uvec2(0x2b2b0808, 0x08191908),
|
| 656 |
+
uvec2(0x08080819, 0x08191919), uvec2(0x08081908, 0x08191919), uvec2(0x0808192b, 0x08191919), uvec2(0x08082b19, 0x08191919),
|
| 657 |
+
uvec2(0x08190808, 0x08191919), uvec2(0x0819082b, 0x08191919), uvec2(0x08191919, 0x08191919), uvec2(0x08192b08, 0x08191919),
|
| 658 |
+
uvec2(0x082b0819, 0x08191919), uvec2(0x082b1908, 0x08191919), uvec2(0x19080808, 0x08191919), uvec2(0x1908082b, 0x08191919),
|
| 659 |
+
uvec2(0x19081919, 0x08191919), uvec2(0x19082b08, 0x08191919), uvec2(0x19190819, 0x08191919), uvec2(0x19191908, 0x08191919),
|
| 660 |
+
uvec2(0x192b0808, 0x08191919), uvec2(0x2b080819, 0x08191919), uvec2(0x2b081908, 0x08191919), uvec2(0x2b190808, 0x08191919),
|
| 661 |
+
uvec2(0x08080808, 0x0819192b), uvec2(0x08081919, 0x0819192b), uvec2(0x08082b08, 0x0819192b), uvec2(0x08190819, 0x0819192b),
|
| 662 |
+
uvec2(0x08191908, 0x0819192b), uvec2(0x082b0808, 0x0819192b), uvec2(0x19080819, 0x0819192b), uvec2(0x19081908, 0x0819192b),
|
| 663 |
+
uvec2(0x19190808, 0x0819192b), uvec2(0x2b080808, 0x0819192b), uvec2(0x2b2b2b2b, 0x0819192b), uvec2(0x08080819, 0x08192b08),
|
| 664 |
+
uvec2(0x08081908, 0x08192b08), uvec2(0x0808192b, 0x08192b08), uvec2(0x08082b19, 0x08192b08), uvec2(0x08190808, 0x08192b08),
|
| 665 |
+
uvec2(0x08191919, 0x08192b08), uvec2(0x08192b08, 0x08192b08), uvec2(0x082b0819, 0x08192b08), uvec2(0x19080808, 0x08192b08),
|
| 666 |
+
uvec2(0x1908082b, 0x08192b08), uvec2(0x19081919, 0x08192b08), uvec2(0x19082b08, 0x08192b08), uvec2(0x19190819, 0x08192b08),
|
| 667 |
+
uvec2(0x19191908, 0x08192b08), uvec2(0x192b0808, 0x08192b08), uvec2(0x2b080819, 0x08192b08), uvec2(0x2b081908, 0x08192b08),
|
| 668 |
+
uvec2(0x08080808, 0x08192b19), uvec2(0x0808082b, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x08082b08, 0x08192b19),
|
| 669 |
+
uvec2(0x08190819, 0x08192b19), uvec2(0x08191908, 0x08192b19), uvec2(0x082b0808, 0x08192b19), uvec2(0x19080819, 0x08192b19),
|
| 670 |
+
uvec2(0x19081908, 0x08192b19), uvec2(0x19190808, 0x08192b19), uvec2(0x192b2b19, 0x08192b19), uvec2(0x2b2b082b, 0x08192b19),
|
| 671 |
+
uvec2(0x08081908, 0x08192b2b), uvec2(0x08190808, 0x08192b2b), uvec2(0x19080808, 0x08192b2b), uvec2(0x1919192b, 0x08192b2b),
|
| 672 |
+
uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08081919, 0x082b0808), uvec2(0x08082b08, 0x082b0808),
|
| 673 |
+
uvec2(0x08190819, 0x082b0808), uvec2(0x08191908, 0x082b0808), uvec2(0x0819192b, 0x082b0808), uvec2(0x08192b19, 0x082b0808),
|
| 674 |
+
uvec2(0x082b0808, 0x082b0808), uvec2(0x082b1919, 0x082b0808), uvec2(0x082b2b2b, 0x082b0808), uvec2(0x19080819, 0x082b0808),
|
| 675 |
+
uvec2(0x19081908, 0x082b0808), uvec2(0x19190808, 0x082b0808), uvec2(0x1919082b, 0x082b0808), uvec2(0x19191919, 0x082b0808),
|
| 676 |
+
uvec2(0x192b1908, 0x082b0808), uvec2(0x2b080808, 0x082b0808), uvec2(0x2b082b2b, 0x082b0808), uvec2(0x2b191908, 0x082b0808),
|
| 677 |
+
uvec2(0x2b2b2b2b, 0x082b0808), uvec2(0x08080819, 0x082b0819), uvec2(0x08081908, 0x082b0819), uvec2(0x08190808, 0x082b0819),
|
| 678 |
+
uvec2(0x0819082b, 0x082b0819), uvec2(0x08191919, 0x082b0819), uvec2(0x082b0819, 0x082b0819), uvec2(0x19080808, 0x082b0819),
|
| 679 |
+
uvec2(0x1908082b, 0x082b0819), uvec2(0x19081919, 0x082b0819), uvec2(0x19190819, 0x082b0819), uvec2(0x19191908, 0x082b0819),
|
| 680 |
+
uvec2(0x192b0808, 0x082b0819), uvec2(0x2b080819, 0x082b0819), uvec2(0x2b081908, 0x082b0819), uvec2(0x2b190808, 0x082b0819),
|
| 681 |
+
uvec2(0x08080808, 0x082b082b), uvec2(0x08082b2b, 0x082b082b), uvec2(0x082b082b, 0x082b082b), uvec2(0x082b2b08, 0x082b082b),
|
| 682 |
+
uvec2(0x082b2b2b, 0x082b082b), uvec2(0x19081908, 0x082b082b), uvec2(0x19190808, 0x082b082b), uvec2(0x2b082b08, 0x082b082b),
|
| 683 |
+
uvec2(0x2b082b2b, 0x082b082b), uvec2(0x2b2b2b08, 0x082b082b), uvec2(0x08080819, 0x082b1908), uvec2(0x08081908, 0x082b1908),
|
| 684 |
+
uvec2(0x0808192b, 0x082b1908), uvec2(0x08082b19, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x08191919, 0x082b1908),
|
| 685 |
+
uvec2(0x08192b08, 0x082b1908), uvec2(0x082b0819, 0x082b1908), uvec2(0x082b1908, 0x082b1908), uvec2(0x19080808, 0x082b1908),
|
| 686 |
+
uvec2(0x1908082b, 0x082b1908), uvec2(0x19081919, 0x082b1908), uvec2(0x19082b08, 0x082b1908), uvec2(0x19190819, 0x082b1908),
|
| 687 |
+
uvec2(0x19191908, 0x082b1908), uvec2(0x192b0808, 0x082b1908), uvec2(0x2b080819, 0x082b1908), uvec2(0x2b081908, 0x082b1908),
|
| 688 |
+
uvec2(0x2b190808, 0x082b1908), uvec2(0x08080808, 0x082b1919), uvec2(0x08081919, 0x082b1919), uvec2(0x08082b08, 0x082b1919),
|
| 689 |
+
uvec2(0x08190819, 0x082b1919), uvec2(0x08191908, 0x082b1919), uvec2(0x082b0808, 0x082b1919), uvec2(0x19080819, 0x082b1919),
|
| 690 |
+
uvec2(0x19081908, 0x082b1919), uvec2(0x19190808, 0x082b1919), uvec2(0x192b192b, 0x082b1919), uvec2(0x2b080808, 0x082b1919),
|
| 691 |
+
uvec2(0x08080819, 0x082b192b), uvec2(0x08081908, 0x082b192b), uvec2(0x08190808, 0x082b192b), uvec2(0x19080808, 0x082b192b),
|
| 692 |
+
uvec2(0x19192b19, 0x082b192b), uvec2(0x08080808, 0x082b2b08), uvec2(0x08081919, 0x082b2b08), uvec2(0x08190819, 0x082b2b08),
|
| 693 |
+
uvec2(0x08191908, 0x082b2b08), uvec2(0x19080819, 0x082b2b08), uvec2(0x19081908, 0x082b2b08), uvec2(0x19190808, 0x082b2b08),
|
| 694 |
+
uvec2(0x2b082b2b, 0x082b2b08), uvec2(0x2b2b2b2b, 0x082b2b08), uvec2(0x08080819, 0x082b2b19), uvec2(0x08081908, 0x082b2b19),
|
| 695 |
+
uvec2(0x08190808, 0x082b2b19), uvec2(0x2b191919, 0x082b2b19), uvec2(0x08082b2b, 0x082b2b2b), uvec2(0x082b082b, 0x082b2b2b),
|
| 696 |
+
uvec2(0x192b1908, 0x082b2b2b), uvec2(0x2b082b08, 0x082b2b2b), uvec2(0x2b082b2b, 0x082b2b2b), uvec2(0x08080819, 0x19080808),
|
| 697 |
+
uvec2(0x08081908, 0x19080808), uvec2(0x0808192b, 0x19080808), uvec2(0x08082b19, 0x19080808), uvec2(0x08190808, 0x19080808),
|
| 698 |
+
uvec2(0x0819082b, 0x19080808), uvec2(0x08191919, 0x19080808), uvec2(0x08192b08, 0x19080808), uvec2(0x08192b2b, 0x19080808),
|
| 699 |
+
uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808), uvec2(0x082b192b, 0x19080808), uvec2(0x19080808, 0x19080808),
|
| 700 |
+
uvec2(0x1908082b, 0x19080808), uvec2(0x19081919, 0x19080808), uvec2(0x19082b08, 0x19080808), uvec2(0x19082b2b, 0x19080808),
|
| 701 |
+
uvec2(0x19190819, 0x19080808), uvec2(0x19191908, 0x19080808), uvec2(0x1919192b, 0x19080808), uvec2(0x19192b19, 0x19080808),
|
| 702 |
+
uvec2(0x192b0808, 0x19080808), uvec2(0x192b082b, 0x19080808), uvec2(0x192b1919, 0x19080808), uvec2(0x2b080819, 0x19080808),
|
| 703 |
+
uvec2(0x2b081908, 0x19080808), uvec2(0x2b190808, 0x19080808), uvec2(0x2b191919, 0x19080808), uvec2(0x2b192b08, 0x19080808),
|
| 704 |
+
uvec2(0x2b2b0819, 0x19080808), uvec2(0x2b2b1908, 0x19080808), uvec2(0x08080808, 0x19080819), uvec2(0x0808082b, 0x19080819),
|
| 705 |
+
uvec2(0x08081919, 0x19080819), uvec2(0x08082b08, 0x19080819), uvec2(0x08190819, 0x19080819), uvec2(0x08191908, 0x19080819),
|
| 706 |
+
uvec2(0x0819192b, 0x19080819), uvec2(0x08192b19, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x082b082b, 0x19080819),
|
| 707 |
+
uvec2(0x082b1919, 0x19080819), uvec2(0x19080819, 0x19080819), uvec2(0x19081908, 0x19080819), uvec2(0x1908192b, 0x19080819),
|
| 708 |
+
uvec2(0x19082b19, 0x19080819), uvec2(0x19190808, 0x19080819), uvec2(0x1919082b, 0x19080819), uvec2(0x19191919, 0x19080819),
|
| 709 |
+
uvec2(0x19192b08, 0x19080819), uvec2(0x192b0819, 0x19080819), uvec2(0x192b1908, 0x19080819), uvec2(0x2b080808, 0x19080819),
|
| 710 |
+
uvec2(0x2b08082b, 0x19080819), uvec2(0x2b081919, 0x19080819), uvec2(0x2b082b08, 0x19080819), uvec2(0x2b190819, 0x19080819),
|
| 711 |
+
uvec2(0x2b191908, 0x19080819), uvec2(0x2b2b0808, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08081908, 0x1908082b),
|
| 712 |
+
uvec2(0x08190808, 0x1908082b), uvec2(0x0819082b, 0x1908082b), uvec2(0x08191919, 0x1908082b), uvec2(0x08192b08, 0x1908082b),
|
| 713 |
+
uvec2(0x082b1908, 0x1908082b), uvec2(0x19080808, 0x1908082b), uvec2(0x19081919, 0x1908082b), uvec2(0x19082b08, 0x1908082b),
|
| 714 |
+
uvec2(0x19190819, 0x1908082b), uvec2(0x19191908, 0x1908082b), uvec2(0x192b0808, 0x1908082b), uvec2(0x2b080819, 0x1908082b),
|
| 715 |
+
uvec2(0x2b081908, 0x1908082b), uvec2(0x08080808, 0x19081908), uvec2(0x0808082b, 0x19081908), uvec2(0x08081919, 0x19081908),
|
| 716 |
+
uvec2(0x08082b08, 0x19081908), uvec2(0x08082b2b, 0x19081908), uvec2(0x08190819, 0x19081908), uvec2(0x08191908, 0x19081908),
|
| 717 |
+
uvec2(0x0819192b, 0x19081908), uvec2(0x08192b19, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x082b082b, 0x19081908),
|
| 718 |
+
uvec2(0x082b1919, 0x19081908), uvec2(0x082b2b08, 0x19081908), uvec2(0x19080819, 0x19081908), uvec2(0x19081908, 0x19081908),
|
| 719 |
+
uvec2(0x1908192b, 0x19081908), uvec2(0x19082b19, 0x19081908), uvec2(0x19190808, 0x19081908), uvec2(0x1919082b, 0x19081908),
|
| 720 |
+
uvec2(0x19191919, 0x19081908), uvec2(0x19192b08, 0x19081908), uvec2(0x192b0819, 0x19081908), uvec2(0x192b1908, 0x19081908),
|
| 721 |
+
uvec2(0x2b080808, 0x19081908), uvec2(0x2b08082b, 0x19081908), uvec2(0x2b081919, 0x19081908), uvec2(0x2b082b08, 0x19081908),
|
| 722 |
+
uvec2(0x2b190819, 0x19081908), uvec2(0x2b191908, 0x19081908), uvec2(0x2b2b0808, 0x19081908), uvec2(0x08080819, 0x19081919),
|
| 723 |
+
uvec2(0x08081908, 0x19081919), uvec2(0x0808192b, 0x19081919), uvec2(0x08082b19, 0x19081919), uvec2(0x08190808, 0x19081919),
|
| 724 |
+
uvec2(0x0819082b, 0x19081919), uvec2(0x08191919, 0x19081919), uvec2(0x08192b08, 0x19081919), uvec2(0x082b0819, 0x19081919),
|
| 725 |
+
uvec2(0x082b1908, 0x19081919), uvec2(0x19080808, 0x19081919), uvec2(0x1908082b, 0x19081919), uvec2(0x19081919, 0x19081919),
|
| 726 |
+
uvec2(0x19082b08, 0x19081919), uvec2(0x19190819, 0x19081919), uvec2(0x19191908, 0x19081919), uvec2(0x192b0808, 0x19081919),
|
| 727 |
+
uvec2(0x192b2b2b, 0x19081919), uvec2(0x2b080819, 0x19081919), uvec2(0x2b081908, 0x19081919), uvec2(0x2b190808, 0x19081919),
|
| 728 |
+
uvec2(0x08080808, 0x1908192b), uvec2(0x0808082b, 0x1908192b), uvec2(0x08081919, 0x1908192b), uvec2(0x08082b08, 0x1908192b),
|
| 729 |
+
uvec2(0x08190819, 0x1908192b), uvec2(0x08191908, 0x1908192b), uvec2(0x082b0808, 0x1908192b), uvec2(0x19080819, 0x1908192b),
|
| 730 |
+
uvec2(0x19081908, 0x1908192b), uvec2(0x19190808, 0x1908192b), uvec2(0x2b080808, 0x1908192b), uvec2(0x2b2b1919, 0x1908192b),
|
| 731 |
+
uvec2(0x08080819, 0x19082b08), uvec2(0x08081908, 0x19082b08), uvec2(0x08082b19, 0x19082b08), uvec2(0x08190808, 0x19082b08),
|
| 732 |
+
uvec2(0x0819082b, 0x19082b08), uvec2(0x08191919, 0x19082b08), uvec2(0x08192b08, 0x19082b08), uvec2(0x082b0819, 0x19082b08),
|
| 733 |
+
uvec2(0x082b1908, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x1908082b, 0x19082b08), uvec2(0x19081919, 0x19082b08),
|
| 734 |
+
uvec2(0x19082b08, 0x19082b08), uvec2(0x19190819, 0x19082b08), uvec2(0x19191908, 0x19082b08), uvec2(0x192b0808, 0x19082b08),
|
| 735 |
+
uvec2(0x2b081908, 0x19082b08), uvec2(0x2b190808, 0x19082b08), uvec2(0x08080808, 0x19082b19), uvec2(0x0808082b, 0x19082b19),
|
| 736 |
+
uvec2(0x08081919, 0x19082b19), uvec2(0x08082b08, 0x19082b19), uvec2(0x08190819, 0x19082b19), uvec2(0x08191908, 0x19082b19),
|
| 737 |
+
uvec2(0x082b0808, 0x19082b19), uvec2(0x19080819, 0x19082b19), uvec2(0x19081908, 0x19082b19), uvec2(0x19190808, 0x19082b19),
|
| 738 |
+
uvec2(0x2b080808, 0x19082b19), uvec2(0x2b19192b, 0x19082b19), uvec2(0x08080819, 0x19082b2b), uvec2(0x08081908, 0x19082b2b),
|
| 739 |
+
uvec2(0x08190808, 0x19082b2b), uvec2(0x19080808, 0x19082b2b), uvec2(0x08080808, 0x19190808), uvec2(0x0808082b, 0x19190808),
|
| 740 |
+
uvec2(0x08081919, 0x19190808), uvec2(0x08082b08, 0x19190808), uvec2(0x08190819, 0x19190808), uvec2(0x08191908, 0x19190808),
|
| 741 |
+
uvec2(0x0819192b, 0x19190808), uvec2(0x08192b19, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x082b082b, 0x19190808),
|
| 742 |
+
uvec2(0x082b1919, 0x19190808), uvec2(0x082b2b08, 0x19190808), uvec2(0x19080819, 0x19190808), uvec2(0x19081908, 0x19190808),
|
| 743 |
+
uvec2(0x1908192b, 0x19190808), uvec2(0x19082b19, 0x19190808), uvec2(0x19190808, 0x19190808), uvec2(0x1919082b, 0x19190808),
|
| 744 |
+
uvec2(0x19191919, 0x19190808), uvec2(0x19192b08, 0x19190808), uvec2(0x192b0819, 0x19190808), uvec2(0x192b1908, 0x19190808),
|
| 745 |
+
uvec2(0x2b080808, 0x19190808), uvec2(0x2b08082b, 0x19190808), uvec2(0x2b081919, 0x19190808), uvec2(0x2b082b08, 0x19190808),
|
| 746 |
+
uvec2(0x2b190819, 0x19190808), uvec2(0x2b191908, 0x19190808), uvec2(0x08080819, 0x19190819), uvec2(0x08081908, 0x19190819),
|
| 747 |
+
uvec2(0x0808192b, 0x19190819), uvec2(0x08082b19, 0x19190819), uvec2(0x08190808, 0x19190819), uvec2(0x0819082b, 0x19190819),
|
| 748 |
+
uvec2(0x08191919, 0x19190819), uvec2(0x08192b08, 0x19190819), uvec2(0x082b0819, 0x19190819), uvec2(0x082b1908, 0x19190819),
|
| 749 |
+
uvec2(0x19080808, 0x19190819), uvec2(0x1908082b, 0x19190819), uvec2(0x19081919, 0x19190819), uvec2(0x19082b08, 0x19190819),
|
| 750 |
+
uvec2(0x19190819, 0x19190819), uvec2(0x19191908, 0x19190819), uvec2(0x192b0808, 0x19190819), uvec2(0x2b080819, 0x19190819),
|
| 751 |
+
uvec2(0x2b081908, 0x19190819), uvec2(0x2b190808, 0x19190819), uvec2(0x08080808, 0x1919082b), uvec2(0x08081919, 0x1919082b),
|
| 752 |
+
uvec2(0x08082b08, 0x1919082b), uvec2(0x08190819, 0x1919082b), uvec2(0x08191908, 0x1919082b), uvec2(0x082b0808, 0x1919082b),
|
| 753 |
+
uvec2(0x19080819, 0x1919082b), uvec2(0x19081908, 0x1919082b), uvec2(0x19190808, 0x1919082b), uvec2(0x192b2b19, 0x1919082b),
|
| 754 |
+
uvec2(0x2b080808, 0x1919082b), uvec2(0x08080819, 0x19191908), uvec2(0x08081908, 0x19191908), uvec2(0x0808192b, 0x19191908),
|
| 755 |
+
uvec2(0x08082b19, 0x19191908), uvec2(0x08190808, 0x19191908), uvec2(0x0819082b, 0x19191908), uvec2(0x08191919, 0x19191908),
|
| 756 |
+
uvec2(0x08192b08, 0x19191908), uvec2(0x082b0819, 0x19191908), uvec2(0x082b1908, 0x19191908), uvec2(0x19080808, 0x19191908),
|
| 757 |
+
uvec2(0x1908082b, 0x19191908), uvec2(0x19081919, 0x19191908), uvec2(0x19082b08, 0x19191908), uvec2(0x19190819, 0x19191908),
|
| 758 |
+
uvec2(0x19191908, 0x19191908), uvec2(0x192b0808, 0x19191908), uvec2(0x2b080819, 0x19191908), uvec2(0x2b081908, 0x19191908),
|
| 759 |
+
uvec2(0x2b190808, 0x19191908), uvec2(0x08080808, 0x19191919), uvec2(0x0808082b, 0x19191919), uvec2(0x08081919, 0x19191919),
|
| 760 |
+
uvec2(0x08082b08, 0x19191919), uvec2(0x08190819, 0x19191919), uvec2(0x08191908, 0x19191919), uvec2(0x082b0808, 0x19191919),
|
| 761 |
+
uvec2(0x19080819, 0x19191919), uvec2(0x19081908, 0x19191919), uvec2(0x19190808, 0x19191919), uvec2(0x2b080808, 0x19191919),
|
| 762 |
+
uvec2(0x08080819, 0x1919192b), uvec2(0x08081908, 0x1919192b), uvec2(0x08190808, 0x1919192b), uvec2(0x082b192b, 0x1919192b),
|
| 763 |
+
uvec2(0x19080808, 0x1919192b), uvec2(0x08080808, 0x19192b08), uvec2(0x0808082b, 0x19192b08), uvec2(0x08081919, 0x19192b08),
|
| 764 |
+
uvec2(0x08082b08, 0x19192b08), uvec2(0x08190819, 0x19192b08), uvec2(0x08191908, 0x19192b08), uvec2(0x082b0808, 0x19192b08),
|
| 765 |
+
uvec2(0x19080819, 0x19192b08), uvec2(0x19081908, 0x19192b08), uvec2(0x19190808, 0x19192b08), uvec2(0x19192b2b, 0x19192b08),
|
| 766 |
+
uvec2(0x2b080808, 0x19192b08), uvec2(0x08080819, 0x19192b19), uvec2(0x08081908, 0x19192b19), uvec2(0x08190808, 0x19192b19),
|
| 767 |
+
uvec2(0x19080808, 0x19192b19), uvec2(0x08080808, 0x19192b2b), uvec2(0x08192b19, 0x19192b2b), uvec2(0x2b081919, 0x19192b2b),
|
| 768 |
+
uvec2(0x2b2b2b08, 0x19192b2b), uvec2(0x08080819, 0x192b0808), uvec2(0x08081908, 0x192b0808), uvec2(0x0808192b, 0x192b0808),
|
| 769 |
+
uvec2(0x08190808, 0x192b0808), uvec2(0x0819082b, 0x192b0808), uvec2(0x08191919, 0x192b0808), uvec2(0x08192b08, 0x192b0808),
|
| 770 |
+
uvec2(0x082b0819, 0x192b0808), uvec2(0x082b1908, 0x192b0808), uvec2(0x19080808, 0x192b0808), uvec2(0x19081919, 0x192b0808),
|
| 771 |
+
uvec2(0x19082b08, 0x192b0808), uvec2(0x19190819, 0x192b0808), uvec2(0x19191908, 0x192b0808), uvec2(0x192b0808, 0x192b0808),
|
| 772 |
+
uvec2(0x2b081908, 0x192b0808), uvec2(0x2b190808, 0x192b0808), uvec2(0x08080808, 0x192b0819), uvec2(0x0808082b, 0x192b0819),
|
| 773 |
+
uvec2(0x08081919, 0x192b0819), uvec2(0x08082b08, 0x192b0819), uvec2(0x08190819, 0x192b0819), uvec2(0x08191908, 0x192b0819),
|
| 774 |
+
uvec2(0x082b0808, 0x192b0819), uvec2(0x19080819, 0x192b0819), uvec2(0x19081908, 0x192b0819), uvec2(0x19190808, 0x192b0819),
|
| 775 |
+
uvec2(0x2b080808, 0x192b0819), uvec2(0x2b192b19, 0x192b0819), uvec2(0x08081908, 0x192b082b), uvec2(0x08190808, 0x192b082b),
|
| 776 |
+
uvec2(0x19080808, 0x192b082b), uvec2(0x1919192b, 0x192b082b), uvec2(0x2b2b0819, 0x192b082b), uvec2(0x08080808, 0x192b1908),
|
| 777 |
+
uvec2(0x08081919, 0x192b1908), uvec2(0x08082b08, 0x192b1908), uvec2(0x08190819, 0x192b1908), uvec2(0x08191908, 0x192b1908),
|
| 778 |
+
uvec2(0x082b0808, 0x192b1908), uvec2(0x19080819, 0x192b1908), uvec2(0x19081908, 0x192b1908), uvec2(0x19190808, 0x192b1908),
|
| 779 |
+
uvec2(0x2b080808, 0x192b1908), uvec2(0x08080819, 0x192b1919), uvec2(0x08081908, 0x192b1919), uvec2(0x08190808, 0x192b1919),
|
| 780 |
+
uvec2(0x19080808, 0x192b1919), uvec2(0x19082b2b, 0x192b1919), uvec2(0x192b2b08, 0x192b1919), uvec2(0x2b19082b, 0x192b1919),
|
| 781 |
+
uvec2(0x08080808, 0x192b192b), uvec2(0x2b191908, 0x192b192b), uvec2(0x08080819, 0x192b2b08), uvec2(0x08081908, 0x192b2b08),
|
| 782 |
+
uvec2(0x08190808, 0x192b2b08), uvec2(0x192b1919, 0x192b2b08), uvec2(0x2b192b08, 0x192b2b08), uvec2(0x08080808, 0x192b2b19),
|
| 783 |
+
uvec2(0x082b2b2b, 0x192b2b19), uvec2(0x1908082b, 0x192b2b2b), uvec2(0x2b2b0819, 0x192b2b2b), uvec2(0x08080808, 0x2b080808),
|
| 784 |
+
uvec2(0x0808082b, 0x2b080808), uvec2(0x08081919, 0x2b080808), uvec2(0x08082b08, 0x2b080808), uvec2(0x08190819, 0x2b080808),
|
| 785 |
+
uvec2(0x08191908, 0x2b080808), uvec2(0x08192b19, 0x2b080808), uvec2(0x082b0808, 0x2b080808), uvec2(0x082b1919, 0x2b080808),
|
| 786 |
+
uvec2(0x19080819, 0x2b080808), uvec2(0x19081908, 0x2b080808), uvec2(0x19190808, 0x2b080808), uvec2(0x1919082b, 0x2b080808),
|
| 787 |
+
uvec2(0x19191919, 0x2b080808), uvec2(0x19192b08, 0x2b080808), uvec2(0x192b0819, 0x2b080808), uvec2(0x2b080808, 0x2b080808),
|
| 788 |
+
uvec2(0x2b081919, 0x2b080808), uvec2(0x2b190819, 0x2b080808), uvec2(0x2b191908, 0x2b080808), uvec2(0x08080819, 0x2b080819),
|
| 789 |
+
uvec2(0x08081908, 0x2b080819), uvec2(0x08082b19, 0x2b080819), uvec2(0x08190808, 0x2b080819), uvec2(0x0819082b, 0x2b080819),
|
| 790 |
+
uvec2(0x08191919, 0x2b080819), uvec2(0x08192b08, 0x2b080819), uvec2(0x082b0819, 0x2b080819), uvec2(0x082b1908, 0x2b080819),
|
| 791 |
+
uvec2(0x19080808, 0x2b080819), uvec2(0x1908082b, 0x2b080819), uvec2(0x19081919, 0x2b080819), uvec2(0x19082b08, 0x2b080819),
|
| 792 |
+
uvec2(0x19190819, 0x2b080819), uvec2(0x19191908, 0x2b080819), uvec2(0x2b080819, 0x2b080819), uvec2(0x2b081908, 0x2b080819),
|
| 793 |
+
uvec2(0x2b190808, 0x2b080819), uvec2(0x2b2b2b19, 0x2b080819), uvec2(0x08080808, 0x2b08082b), uvec2(0x08081919, 0x2b08082b),
|
| 794 |
+
uvec2(0x08082b2b, 0x2b08082b), uvec2(0x08190819, 0x2b08082b), uvec2(0x08191908, 0x2b08082b), uvec2(0x19080819, 0x2b08082b),
|
| 795 |
+
uvec2(0x19081908, 0x2b08082b), uvec2(0x19190808, 0x2b08082b), uvec2(0x08080819, 0x2b081908), uvec2(0x08081908, 0x2b081908),
|
| 796 |
+
uvec2(0x0808192b, 0x2b081908), uvec2(0x08082b19, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x0819082b, 0x2b081908),
|
| 797 |
+
uvec2(0x08191919, 0x2b081908), uvec2(0x08192b08, 0x2b081908), uvec2(0x082b0819, 0x2b081908), uvec2(0x19080808, 0x2b081908),
|
| 798 |
+
uvec2(0x1908082b, 0x2b081908), uvec2(0x19081919, 0x2b081908), uvec2(0x19082b08, 0x2b081908), uvec2(0x19190819, 0x2b081908),
|
| 799 |
+
uvec2(0x19191908, 0x2b081908), uvec2(0x192b0808, 0x2b081908), uvec2(0x2b080819, 0x2b081908), uvec2(0x2b081908, 0x2b081908),
|
| 800 |
+
uvec2(0x2b190808, 0x2b081908), uvec2(0x08080808, 0x2b081919), uvec2(0x0808082b, 0x2b081919), uvec2(0x08081919, 0x2b081919),
|
| 801 |
+
uvec2(0x08082b08, 0x2b081919), uvec2(0x08190819, 0x2b081919), uvec2(0x08191908, 0x2b081919), uvec2(0x082b0808, 0x2b081919),
|
| 802 |
+
uvec2(0x19080819, 0x2b081919), uvec2(0x19081908, 0x2b081919), uvec2(0x19190808, 0x2b081919), uvec2(0x2b080808, 0x2b081919),
|
| 803 |
+
uvec2(0x2b082b2b, 0x2b081919), uvec2(0x08080819, 0x2b08192b), uvec2(0x08081908, 0x2b08192b), uvec2(0x08190808, 0x2b08192b),
|
| 804 |
+
uvec2(0x082b2b19, 0x2b08192b), uvec2(0x19080808, 0x2b08192b), uvec2(0x08080808, 0x2b082b08), uvec2(0x08081919, 0x2b082b08),
|
| 805 |
+
uvec2(0x08190819, 0x2b082b08), uvec2(0x08191908, 0x2b082b08), uvec2(0x19080819, 0x2b082b08), uvec2(0x19081908, 0x2b082b08),
|
| 806 |
+
uvec2(0x19190808, 0x2b082b08), uvec2(0x2b2b082b, 0x2b082b08), uvec2(0x08080819, 0x2b082b19), uvec2(0x08081908, 0x2b082b19),
|
| 807 |
+
uvec2(0x19080808, 0x2b082b19), uvec2(0x192b1919, 0x2b082b19), uvec2(0x082b082b, 0x2b082b2b), uvec2(0x19192b08, 0x2b082b2b),
|
| 808 |
+
uvec2(0x19192b2b, 0x2b082b2b), uvec2(0x2b08082b, 0x2b082b2b), uvec2(0x2b2b082b, 0x2b082b2b), uvec2(0x08080819, 0x2b190808),
|
| 809 |
+
uvec2(0x08081908, 0x2b190808), uvec2(0x08082b19, 0x2b190808), uvec2(0x08190808, 0x2b190808), uvec2(0x0819082b, 0x2b190808),
|
| 810 |
+
uvec2(0x08191919, 0x2b190808), uvec2(0x08192b08, 0x2b190808), uvec2(0x082b1908, 0x2b190808), uvec2(0x19080808, 0x2b190808),
|
| 811 |
+
uvec2(0x1908082b, 0x2b190808), uvec2(0x19081919, 0x2b190808), uvec2(0x19082b08, 0x2b190808), uvec2(0x19190819, 0x2b190808),
|
| 812 |
+
uvec2(0x19191908, 0x2b190808), uvec2(0x192b0808, 0x2b190808), uvec2(0x2b080819, 0x2b190808), uvec2(0x2b081908, 0x2b190808),
|
| 813 |
+
uvec2(0x2b190808, 0x2b190808), uvec2(0x08080808, 0x2b190819), uvec2(0x08081919, 0x2b190819), uvec2(0x08190819, 0x2b190819),
|
| 814 |
+
uvec2(0x08191908, 0x2b190819), uvec2(0x19080819, 0x2b190819), uvec2(0x19081908, 0x2b190819), uvec2(0x19190808, 0x2b190819),
|
| 815 |
+
uvec2(0x19192b2b, 0x2b190819), uvec2(0x08080819, 0x2b19082b), uvec2(0x08081908, 0x2b19082b), uvec2(0x08190808, 0x2b19082b),
|
| 816 |
+
uvec2(0x19080808, 0x2b19082b), uvec2(0x2b2b192b, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x0808082b, 0x2b191908),
|
| 817 |
+
uvec2(0x08081919, 0x2b191908), uvec2(0x08082b08, 0x2b191908), uvec2(0x08190819, 0x2b191908), uvec2(0x08191908, 0x2b191908),
|
| 818 |
+
uvec2(0x082b0808, 0x2b191908), uvec2(0x19080819, 0x2b191908), uvec2(0x19081908, 0x2b191908), uvec2(0x19190808, 0x2b191908),
|
| 819 |
+
uvec2(0x2b080808, 0x2b191908), uvec2(0x2b19192b, 0x2b191908), uvec2(0x08080819, 0x2b191919), uvec2(0x08081908, 0x2b191919),
|
| 820 |
+
uvec2(0x08190808, 0x2b191919), uvec2(0x19080808, 0x2b191919), uvec2(0x2b192b08, 0x2b191919), uvec2(0x2b2b0819, 0x2b191919),
|
| 821 |
+
uvec2(0x08080808, 0x2b19192b), uvec2(0x1908192b, 0x2b19192b), uvec2(0x192b1908, 0x2b19192b), uvec2(0x08080819, 0x2b192b08),
|
| 822 |
+
uvec2(0x08081908, 0x2b192b08), uvec2(0x08190808, 0x2b192b08), uvec2(0x082b192b, 0x2b192b08), uvec2(0x19080808, 0x2b192b08),
|
| 823 |
+
uvec2(0x2b2b2b19, 0x2b192b08), uvec2(0x08080808, 0x2b192b19), uvec2(0x19082b19, 0x2b192b19), uvec2(0x1919082b, 0x2b192b19),
|
| 824 |
+
uvec2(0x2b190808, 0x2b192b2b), uvec2(0x08080808, 0x2b2b0808), uvec2(0x08081919, 0x2b2b0808), uvec2(0x08082b2b, 0x2b2b0808),
|
| 825 |
+
uvec2(0x08191908, 0x2b2b0808), uvec2(0x082b082b, 0x2b2b0808), uvec2(0x082b2b2b, 0x2b2b0808), uvec2(0x19080819, 0x2b2b0808),
|
| 826 |
+
uvec2(0x19081908, 0x2b2b0808), uvec2(0x19190808, 0x2b2b0808), uvec2(0x2b2b082b, 0x2b2b0808), uvec2(0x2b2b2b2b, 0x2b2b0808),
|
| 827 |
+
uvec2(0x19080808, 0x2b2b0819), uvec2(0x192b1919, 0x2b2b0819), uvec2(0x0808082b, 0x2b2b082b), uvec2(0x08082b2b, 0x2b2b082b),
|
| 828 |
+
uvec2(0x082b082b, 0x2b2b082b), uvec2(0x082b2b08, 0x2b2b082b), uvec2(0x082b2b2b, 0x2b2b082b), uvec2(0x2b08082b, 0x2b2b082b),
|
| 829 |
+
uvec2(0x2b082b08, 0x2b2b082b), uvec2(0x2b082b2b, 0x2b2b082b), uvec2(0x2b2b2b08, 0x2b2b082b), uvec2(0x08080819, 0x2b2b1908),
|
| 830 |
+
uvec2(0x08081908, 0x2b2b1908), uvec2(0x08190808, 0x2b2b1908), uvec2(0x19080808, 0x2b2b1908), uvec2(0x2b082b19, 0x2b2b1908),
|
| 831 |
+
uvec2(0x2b2b1908, 0x2b2b1908), uvec2(0x08080808, 0x2b2b1919), uvec2(0x08192b19, 0x2b2b1919), uvec2(0x19190819, 0x2b2b192b),
|
| 832 |
+
uvec2(0x08082b2b, 0x2b2b2b08), uvec2(0x082b2b08, 0x2b2b2b08), uvec2(0x2b2b082b, 0x2b2b2b08), uvec2(0x19191908, 0x2b2b2b19),
|
| 833 |
+
uvec2(0x2b08192b, 0x2b2b2b19), uvec2(0x08082b08, 0x2b2b2b2b), uvec2(0x08082b2b, 0x2b2b2b2b), uvec2(0x082b0808, 0x2b2b2b2b),
|
| 834 |
+
uvec2(0x082b082b, 0x2b2b2b2b), uvec2(0x082b2b08, 0x2b2b2b2b), uvec2(0x2b082b08, 0x2b2b2b2b), uvec2(0x2b2b2b2b, 0x2b2b2b2b)
|
| 835 |
+
};
|
| 836 |
+
|
| 837 |
+
shared uvec2 iq2s_grid[1024];
|
| 838 |
+
|
| 839 |
+
// Stage the IQ2_S grid lookup table in workgroup-shared memory.
//
// wgsize: workgroup dimensions supplied by the caller; only the x component
//         is used, as the stride between the entries one invocation copies.
void init_iq_shmem(uvec3 wgsize)
{
    // Every invocation starts at its own local index and walks the table in
    // steps of wgsize.x, copying the entries it lands on.
    // NOTE(review): the start index is the flattened local invocation index
    // while the stride is only wgsize.x — this looks like it assumes a 1-D
    // workgroup; otherwise some entries are written more than once with the
    // same value. Confirm against the shaders that call this.
    uint slot = gl_LocalInvocationIndex.x;
    while (slot < iq2s_grid.length()) {
        iq2s_grid[slot] = iq2s_grid_const[slot];
        slot += wgsize.x;
    }

    // Sync the workgroup so the table is fully populated before anyone reads it.
    barrier();
}
|
| 847 |
+
|
| 848 |
+
// Bind the generic QUANT_K / QUANT_R / A_TYPE names to their IQ2_S definitions.
// Unlike the IQ3 variants further down, no A_TYPE_PACKED16 alias is set here.
#define QUANT_K QUANT_K_IQ2_S
|
| 849 |
+
#define QUANT_R QUANT_R_IQ2_S
|
| 850 |
+
#define A_TYPE block_iq2_s
|
| 851 |
+
#endif
|
| 852 |
+
|
| 853 |
+
// IQ3_XXS sizing constants. QUANT_K_IQ3_XXS determines the array lengths of the
// IQ3_XXS block structs (presumably the number of weights per block — confirm).
// QUANT_R_IQ3_XXS is only re-exported as QUANT_R in this part of the file.
#define QUANT_K_IQ3_XXS 256
|
| 854 |
+
#define QUANT_R_IQ3_XXS 1
|
| 855 |
+
|
| 856 |
+
// Byte-granular layout of one IQ3_XXS block: a half-precision header value
// followed by 96 bytes of packed quantization data.
struct block_iq3_xxs
|
| 857 |
+
{
|
| 858 |
+
// presumably the per-block scale — confirm against the dequant shaders
float16_t d;
|
| 859 |
+
// 256/4 + 256/8 = 64 + 32 = 96 bytes; the size is written as a sum of two
// parts, but what each part holds is not visible here — TODO confirm
uint8_t qs[QUANT_K_IQ3_XXS/4 + QUANT_K_IQ3_XXS/8];
|
| 860 |
+
};
|
| 861 |
+
|
| 862 |
+
// Same byte footprint as block_iq3_xxs, but with the payload declared as
// 16-bit words (48 x uint16_t = 96 bytes) instead of 96 single bytes.
struct block_iq3_xxs_packed16
|
| 863 |
+
{
|
| 864 |
+
// presumably the per-block scale — confirm against the dequant shaders
float16_t d;
|
| 865 |
+
// 256/8 + 256/16 = 32 + 16 = 48 words
uint16_t qs[QUANT_K_IQ3_XXS/8 + QUANT_K_IQ3_XXS/16];
|
| 866 |
+
};
|
| 867 |
+
|
| 868 |
+
#if defined(DATA_A_IQ3_XXS)
|
| 869 |
+
|
| 870 |
+
const uint32_t iq3xxs_grid_const[256] = {
|
| 871 |
+
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 872 |
+
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
| 873 |
+
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
| 874 |
+
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
| 875 |
+
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
| 876 |
+
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
| 877 |
+
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
| 878 |
+
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
| 879 |
+
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
| 880 |
+
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
| 881 |
+
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
| 882 |
+
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
| 883 |
+
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
| 884 |
+
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
| 885 |
+
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
| 886 |
+
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
| 887 |
+
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
| 888 |
+
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
| 889 |
+
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
| 890 |
+
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
| 891 |
+
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
| 892 |
+
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
| 893 |
+
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
| 894 |
+
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
| 895 |
+
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
| 896 |
+
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
| 897 |
+
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
| 898 |
+
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
| 899 |
+
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
| 900 |
+
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
| 901 |
+
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
| 902 |
+
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
| 903 |
+
};
|
| 904 |
+
|
| 905 |
+
shared uint32_t iq3xxs_grid[256];
|
| 906 |
+
|
| 907 |
+
// Stage the 256-entry IQ3_XXS grid lookup table in workgroup-shared memory.
//
// wgsize: workgroup dimensions supplied by the caller; only the x component
//         is used, as the stride between the entries one invocation copies.
void init_iq_shmem(uvec3 wgsize)
{
    // Every invocation starts at its own local index and walks the table in
    // steps of wgsize.x, copying the entries it lands on.
    // NOTE(review): the start index is the flattened local invocation index
    // while the stride is only wgsize.x — this looks like it assumes a 1-D
    // workgroup; otherwise some entries are written more than once with the
    // same value. Confirm against the shaders that call this.
    uint slot = gl_LocalInvocationIndex.x;
    while (slot < iq3xxs_grid.length()) {
        iq3xxs_grid[slot] = iq3xxs_grid_const[slot];
        slot += wgsize.x;
    }

    // Sync the workgroup so the table is fully populated before anyone reads it.
    barrier();
}
|
| 915 |
+
|
| 916 |
+
// Inside the DATA_A_IQ3_XXS section: bind the generic QUANT_K / QUANT_R /
// A_TYPE / A_TYPE_PACKED16 names to the IQ3_XXS constants and block structs.
#define QUANT_K QUANT_K_IQ3_XXS
|
| 917 |
+
#define QUANT_R QUANT_R_IQ3_XXS
|
| 918 |
+
#define A_TYPE block_iq3_xxs
|
| 919 |
+
#define A_TYPE_PACKED16 block_iq3_xxs_packed16
|
| 920 |
+
#endif
|
| 921 |
+
|
| 922 |
+
// IQ3_S sizing constants. QUANT_K_IQ3_S determines the array lengths of the
// IQ3_S block structs (presumably the number of weights per block — confirm).
// QUANT_R_IQ3_S is only re-exported as QUANT_R in this part of the file.
#define QUANT_K_IQ3_S 256
|
| 923 |
+
#define QUANT_R_IQ3_S 1
|
| 924 |
+
|
| 925 |
+
// Byte-granular layout of one IQ3_S block: a half-precision header value
// followed by four byte arrays (64 + 8 + 32 + 4 = 108 bytes of payload).
// Field roles below are inferred from the names only — confirm against the
// dequant shaders before relying on them.
struct block_iq3_s
|
| 926 |
+
{
|
| 927 |
+
// presumably the per-block scale
float16_t d;
|
| 928 |
+
// 256/4 = 64 bytes
uint8_t qs[QUANT_K_IQ3_S/4];
|
| 929 |
+
// 256/32 = 8 bytes
uint8_t qh[QUANT_K_IQ3_S/32];
|
| 930 |
+
// 256/8 = 32 bytes
uint8_t signs[QUANT_K_IQ3_S/8];
|
| 931 |
+
// 256/64 = 4 bytes
uint8_t scales[QUANT_K_IQ3_S/64];
|
| 932 |
+
};
|
| 933 |
+
|
| 934 |
+
// Same byte footprint as block_iq3_s, but with every array declared as 16-bit
// words: each length is the corresponding uint8_t length divided by two.
struct block_iq3_s_packed16
|
| 935 |
+
{
|
| 936 |
+
// presumably the per-block scale — confirm against the dequant shaders
float16_t d;
|
| 937 |
+
// 256/4/2 = 32 words
uint16_t qs[QUANT_K_IQ3_S/4/2];
|
| 938 |
+
// 256/32/2 = 4 words
uint16_t qh[QUANT_K_IQ3_S/32/2];
|
| 939 |
+
// 256/8/2 = 16 words
uint16_t signs[QUANT_K_IQ3_S/8/2];
|
| 940 |
+
// 256/64/2 = 2 words
uint16_t scales[QUANT_K_IQ3_S/64/2];
|
| 941 |
+
};
|
| 942 |
+
|
| 943 |
+
#if defined(DATA_A_IQ3_S)
|
| 944 |
+
|
| 945 |
+
const uint32_t iq3s_grid_const[512] = {
|
| 946 |
+
0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
|
| 947 |
+
0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
|
| 948 |
+
0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
|
| 949 |
+
0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
|
| 950 |
+
0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
|
| 951 |
+
0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
|
| 952 |
+
0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
|
| 953 |
+
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
|
| 954 |
+
0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
|
| 955 |
+
0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
|
| 956 |
+
0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
|
| 957 |
+
0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
|
| 958 |
+
0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
|
| 959 |
+
0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
|
| 960 |
+
0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
|
| 961 |
+
0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
|
| 962 |
+
0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
|
| 963 |
+
0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
|
| 964 |
+
0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
|
| 965 |
+
0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
|
| 966 |
+
0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
|
| 967 |
+
0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
|
| 968 |
+
0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
|
| 969 |
+
0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
|
| 970 |
+
0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
|
| 971 |
+
0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
|
| 972 |
+
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
|
| 973 |
+
0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
|
| 974 |
+
0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
|
| 975 |
+
0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
|
| 976 |
+
0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
|
| 977 |
+
0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
|
| 978 |
+
0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
|
| 979 |
+
0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
|
| 980 |
+
0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
|
| 981 |
+
0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
|
| 982 |
+
0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
|
| 983 |
+
0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
|
| 984 |
+
0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
|
| 985 |
+
0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
|
| 986 |
+
0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
|
| 987 |
+
0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
|
| 988 |
+
0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
|
| 989 |
+
0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
|
| 990 |
+
0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
|
| 991 |
+
0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
|
| 992 |
+
0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
|
| 993 |
+
0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
|
| 994 |
+
0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
|
| 995 |
+
0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
|
| 996 |
+
0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
|
| 997 |
+
0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
|
| 998 |
+
0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
|
| 999 |
+
0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
|
| 1000 |
+
0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
|
| 1001 |
+
0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
|
| 1002 |
+
0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
|
| 1003 |
+
0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
|
| 1004 |
+
0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
|
| 1005 |
+
0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
|
| 1006 |
+
0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
|
| 1007 |
+
0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
|
| 1008 |
+
0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
|
| 1009 |
+
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
| 1010 |
+
};
|
| 1011 |
+
|
| 1012 |
+
shared uint32_t iq3s_grid[512];
|
| 1013 |
+
|
| 1014 |
+
// Stage the 512-entry IQ3_S grid lookup table in workgroup-shared memory.
//
// wgsize: workgroup dimensions supplied by the caller; only the x component
//         is used, as the stride between the entries one invocation copies.
void init_iq_shmem(uvec3 wgsize)
{
    // Every invocation starts at its own local index and walks the table in
    // steps of wgsize.x, copying the entries it lands on.
    // NOTE(review): the start index is the flattened local invocation index
    // while the stride is only wgsize.x — this looks like it assumes a 1-D
    // workgroup; otherwise some entries are written more than once with the
    // same value. Confirm against the shaders that call this.
    uint slot = gl_LocalInvocationIndex.x;
    while (slot < iq3s_grid.length()) {
        iq3s_grid[slot] = iq3s_grid_const[slot];
        slot += wgsize.x;
    }

    // Sync the workgroup so the table is fully populated before anyone reads it.
    barrier();
}
|
| 1022 |
+
|
| 1023 |
+
// Inside the DATA_A_IQ3_S section: bind the generic QUANT_K / QUANT_R /
// A_TYPE / A_TYPE_PACKED16 names to the IQ3_S constants and block structs.
#define QUANT_K QUANT_K_IQ3_S
|
| 1024 |
+
#define QUANT_R QUANT_R_IQ3_S
|
| 1025 |
+
#define A_TYPE block_iq3_s
|
| 1026 |
+
#define A_TYPE_PACKED16 block_iq3_s_packed16
|
| 1027 |
+
#endif
|
| 1028 |
+
|
| 1029 |
#define QUANT_K_IQ4_NL 32
|
| 1030 |
#define QUANT_R_IQ4_NL 2
|
| 1031 |
|
|
|
|
| 1050 |
|
| 1051 |
shared FLOAT_TYPE kvalues_iq4nl[16];
|
| 1052 |
|
| 1053 |
+
// Stage the IQ4_NL value table in workgroup-shared memory, converting each
// constant entry to FLOAT_TYPE on the way.
//
// wgsize: workgroup dimensions supplied by the caller; only the x component
//         is used, as the stride between the entries one invocation copies.
void init_iq_shmem(uvec3 wgsize)
{
    // Every invocation starts at its own local index and walks the table in
    // steps of wgsize.x, converting and copying the entries it lands on.
    uint slot = gl_LocalInvocationIndex.x;
    while (slot < kvalues_iq4nl.length()) {
        kvalues_iq4nl[slot] = FLOAT_TYPE(kvalues_iq4nl_const[slot]);
        slot += wgsize.x;
    }

    // Sync the workgroup so the table is fully populated before anyone reads it.
    barrier();
}
|
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
CHANGED
|
@@ -55,6 +55,11 @@ const std::vector<std::string> type_names = {
|
|
| 55 |
"q4_k",
|
| 56 |
"q5_k",
|
| 57 |
"q6_k",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
"iq4_nl"
|
| 59 |
};
|
| 60 |
|
|
|
|
| 55 |
"q4_k",
|
| 56 |
"q5_k",
|
| 57 |
"q6_k",
|
| 58 |
+
"iq2_xxs",
|
| 59 |
+
"iq2_xs",
|
| 60 |
+
"iq2_s",
|
| 61 |
+
"iq3_xxs",
|
| 62 |
+
"iq3_s",
|
| 63 |
"iq4_nl"
|
| 64 |
};
|
| 65 |
|