JohannesGaessler committed
Commit e446f60 · 1 Parent(s): 66edfb6

CUDA: remove DMMV, consolidate F16 mult mat vec (llama/10318)
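Editor's note: the changes below remove the old dequantize_mul_mat_vec (DMMV) path together with the two special-purpose F16 kernels (mul_mat_p021_f16_f32, mul_mat_vec_nc_f16_f32) and replace them with a single templated F16 matrix-vector kernel in the new mmv.cu. As background, here is a minimal, self-contained sketch of the pattern that kernel is built around: one block per output row, each thread accumulating a strided partial dot product, partials combined with a warp shuffle reduction. The names warp_sum and f16_mat_vec_sketch are illustrative only and do not appear in the commit; the real kernel additionally reads half2/float2 pairs, supports FP16 or FP32 accumulation, larger block sizes with a shared-memory stage, and broadcasting across channels.

#include <cuda_fp16.h>

// Butterfly reduction across the 32 lanes of a warp.
static __device__ float warp_sum(float v) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, offset);
    }
    return v;
}

// Sketch only: assumes one block per row and blockDim.x == 32 (a single warp).
static __global__ void f16_mat_vec_sketch(const half * x, const float * y, float * dst, int ncols) {
    const int row = blockIdx.x;
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += __half2float(x[row*ncols + col]) * y[col];
    }
    sum = warp_sum(sum);
    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}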

ggml/CMakeLists.txt CHANGED
@@ -128,14 +128,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
 
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
-option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
-set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-    "ggml: iters./thread per block for Q2_K/Q6_K")
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
 
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -16,11 +16,11 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
-#include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
+#include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -1020,114 +1020,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
-static __global__ void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi = __half2float(x[ix]);
-
-        const int row_y = col_x;
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi = __half2float(x[ix]);
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
-}
-
 static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
@@ -1654,58 +1546,6 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    cudaStream_t main_stream = ctx.stream();
-
-    void  * src0_ddq = src0->data;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    cudaStream_t main_stream = ctx.stream();
-
-    void  * src0_ddq = src0->data;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
-
-    const int64_t row_stride_x = nb01 / sizeof(half);
-    const int64_t channel_stride_x = nb02 / sizeof(half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-
 static __global__ void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, char * dst,
     const void ** ptrs_src, void ** ptrs_dst,
@@ -1879,21 +1719,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
-    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
+    bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
-    bool any_gpus_with_slow_fp16 = false;
+    bool any_gpus_with_slow_fp16 = false;
+    bool any_gpus_without_fp16_mma = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1904,14 +1740,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }
 
-            const int cc = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+            const int cc = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
         }
     } else {
-        const int cc = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+        const int cc = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
     }
 
     // debug helpers
@@ -1922,18 +1760,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
-        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
-        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
+    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch without FlashAttention
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
    } else if (use_mul_mat_vec_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
    } else if (use_mul_mat_q) {
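Editor's note: the first branch of the dispatch above can be condensed into a predicate, shown here as a sketch only (the helper name should_use_f16_mmv is hypothetical and not part of the commit). MMV_MAX_ROWS (512) comes from the new mmv.cuh, and any_gpus_without_fp16_mma is computed per device from fp16_mma_available(cc) as in the hunk above.

// Sketch, not code from the commit: the consolidated F16 kernel handles single-column
// (batch size 1) products either when the matrix is small or when some GPU in use lacks
// FP16 tensor cores; larger F16 cases fall through to batched cuBLAS.
static bool should_use_f16_mmv(const ggml_tensor * src0, const ggml_tensor * src1,
                               const ggml_tensor * dst, bool split, bool any_gpus_without_fp16_mma) {
    return !split && src0->type == GGML_TYPE_F16 &&
           src1->ne[1] == 1 && dst->ne[3] == 1 &&
           (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma);
}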
 
ggml/src/ggml-cuda/CMakeLists.txt CHANGED
@@ -54,21 +54,12 @@ if (CUDAToolkit_FOUND)
     target_link_libraries(ggml-cuda PRIVATE ggml-base)
     target_include_directories(ggml-cuda PRIVATE . ..)
 
-    # TODO: change the definitions to this target only
-
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
     if (GGML_CUDA_GRAPHS)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     endif()
 
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -81,10 +72,6 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
-    if (DEFINED GGML_CUDA_DMMV_Y)
-        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
-    endif()
-
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
 
ggml/src/ggml-cuda/mmv.cu ADDED
@@ -0,0 +1,223 @@
+#include "common.cuh"
+#include "mmv.cuh"
+
+template <typename type_acc, int block_size>
+static __global__ void mul_mat_vec(
+        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
+    const int64_t row     = blockIdx.x;
+    const int64_t channel = blockIdx.z;
+    const int     tid     = threadIdx.x;
+
+    x   += (channel/channel_ratio)*stride_channel_x + row*stride_row;
+    y   += channel*stride_channel_y;
+    dst += channel*stride_channel_dst;
+
+    const half2  * x2 = (const half2  *) x;
+    const float2 * y2 = (const float2 *) y;
+
+    extern __shared__ char data_mmv[];
+    float * buf_iw = (float *) data_mmv;
+
+    if (block_size > WARP_SIZE) {
+        if (tid < WARP_SIZE) {
+            buf_iw[tid] = 0.0f;
+        }
+        __syncthreads();
+    }
+
+    float sumf;
+
+    if (std::is_same<type_acc, float>::value) {
+        sumf = 0.0f;
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const float2 tmpx = __half22float2(x2[col2]);
+            const float2 tmpy = y2[col2];
+            sumf += tmpx.x * tmpy.x;
+            sumf += tmpx.y * tmpy.y;
+        }
+    } else {
+#ifdef FP16_AVAILABLE
+        half2 sumh2 = make_half2(0.0f, 0.0f);
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const float2 tmp = y2[col2];
+            sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+        }
+
+        sumf = __low2float(sumh2) + __high2float(sumh2);
+#else
+        NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+    }
+
+    sumf = warp_reduce_sum(sumf);
+
+    if (block_size > WARP_SIZE) {
+        buf_iw[tid/WARP_SIZE] = sumf;
+        __syncthreads();
+        if (tid > WARP_SIZE) {
+            return;
+        }
+        sumf = buf_iw[tid];
+        sumf = warp_reduce_sum(sumf);
+    }
+
+    if (tid != 0) {
+        return;
+    }
+
+    dst[row] = sumf;
+}
+
+template <typename type_acc>
+static void launch_mul_mat_vec_cuda(
+        const half * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        cudaStream_t stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    GGML_ASSERT(stride_row % 2 == 0);
+    GGML_ASSERT(nchannels_y % nchannels_x == 0);
+    const int64_t channel_ratio = nchannels_y / nchannels_x;
+
+    int64_t block_size_best = WARP_SIZE;
+    int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
+    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
+        if (niter < niter_best) {
+            niter_best = niter;
+            block_size_best = block_size;
+        }
+    }
+
+    const int smem = WARP_SIZE*sizeof(float);
+    const dim3 block_nums(nrows, 1, nchannels_y);
+    const dim3 block_dims(block_size_best, 1, 1);
+    switch (block_size_best) {
+        case 32: {
+            mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 64: {
+            mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 96: {
+            mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 128: {
+            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 160: {
+            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 192: {
+            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 224: {
+            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case 256: {
+            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        default: {
+            GGML_ABORT("fatal error");
+        } break;
+    }
+}
+
+static void mul_mat_vec_cuda(
+        const half * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        enum ggml_prec prec, cudaStream_t stream) {
+    switch (prec) {
+        case GGML_PREC_DEFAULT: {
+            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+        } break;
+        case GGML_PREC_F32: {
+            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+        } break;
+    }
+}
+
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+
+    GGML_ASSERT(src1->ne[1] == 1);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+    const half  * src0_d = (const half  *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       *  dst_d = (float       *)  dst->data;
+
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne12 = src1->ne[2];
+    GGML_ASSERT(dst->ne[2] == ne12);
+
+    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[3] == 1);
+    GGML_ASSERT( dst->ne[3] == 1);
+
+    const int64_t stride_row         = src0->nb[1] / ggml_type_size(src0->type);
+    const int64_t channel_stride_x   = src0->nb[2] / ggml_type_size(src0->type);
+    const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
+    const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);
+
+    mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+}
+
+void ggml_cuda_op_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    GGML_ASSERT(src1_ncols == 1);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+
+    // ggml_cuda_op provides single, contiguous matrices
+    const int64_t stride_row         = ne00;
+    const int64_t nchannels_x        = 1;
+    const int64_t nchannels_y        = 1;
+    const int64_t channel_stride_x   = 0;
+    const int64_t channel_stride_y   = 0;
+    const int64_t channel_stride_dst = 0;
+
+    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+        nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
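
Editor's note: a small standalone illustration (not part of the commit) of the launch heuristic in launch_mul_mat_vec_cuda above. Since the kernel consumes half2/float2 pairs, a block of bs threads needs ceil(ncols / (2*bs)) loop iterations per row, and the smallest block size (a multiple of WARP_SIZE, capped at 256) that minimizes that count is chosen. The example assumes WARP_SIZE = 32 and a hypothetical ncols = 4096.

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t WARP_SIZE = 32;   // assumption for this example
    const int64_t ncols     = 4096; // hypothetical row length

    int64_t block_size_best = WARP_SIZE;
    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE); // 64 iterations
    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
        if (niter < niter_best) {
            niter_best      = niter;
            block_size_best = block_size;
        }
    }
    // prints: block_size_best = 256, niter_best = 8
    printf("block_size_best = %lld, niter_best = %lld\n",
           (long long) block_size_best, (long long) niter_best);
    return 0;
}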
ggml/src/ggml-cuda/mmv.cuh ADDED
@@ -0,0 +1,12 @@
+#include "common.cuh"
+
+// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
+#define MMV_MAX_ROWS 512
+
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_cuda_op_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
ggml/src/ggml-hip/CMakeLists.txt CHANGED
@@ -75,18 +75,11 @@ target_include_directories(ggml-hip PRIVATE . ..)
     target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
 
     add_compile_definitions(GGML_USE_HIP)
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
 
     if (GGML_HIP_UMA)
         add_compile_definitions(GGML_HIP_UMA)
     endif()
 
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
  endif()
 
ggml/src/ggml-musa/CMakeLists.txt CHANGED
@@ -58,19 +58,12 @@ if (MUSAToolkit_FOUND)
     target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
 
     add_compile_definitions(GGML_USE_MUSA)
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
     if (GGML_CUDA_GRAPHS)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     endif()
 
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -83,10 +76,6 @@ if (MUSAToolkit_FOUND)
        add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
-    if (DEFINED GGML_CUDA_DMMV_Y)
-        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
-    endif()
-
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
  endif()
 
58
  target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
59
 
60
  add_compile_definitions(GGML_USE_MUSA)
 
 
 
61
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
62
 
63
  if (GGML_CUDA_GRAPHS)
64
  add_compile_definitions(GGML_CUDA_USE_GRAPHS)
65
  endif()
66
 
 
 
 
 
67
  if (GGML_CUDA_FORCE_MMQ)
68
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
69
  endif()
 
76
  add_compile_definitions(GGML_CUDA_NO_VMM)
77
  endif()
78
 
 
 
 
 
79
  if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
80
  add_compile_definitions(GGML_CUDA_F16)
81
  endif()