JohannesGaessler committed on
Commit e7b20b1 · 1 Parent(s): 925eb7a

CUDA: deduplicate mmq code (llama/7397)

Files changed (1)
  1. ggml-cuda/mmq.cu +271 -966
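The commit collapses the per-quantization-type tile-size macros (MMQ_X_*, MMQ_Y_*, NWARPS_* for RDNA2/RDNA1/Ampere/Pascal) and the ten nearly identical launch paths into shared data structures plus one compile-time selection helper. The sketch below condenses the pattern as it appears in the added lines of this diff: the struct, constant, and function names (mmq_arch_config_t, mmq_config_t, MMQ_CONFIG_Q4_0, get_arch_config_device) are taken from the diff itself, while the kernel usage is reduced to a comment and the surrounding ggml-cuda macros (CC_VOLTA, CUDA_USE_TENSOR_CORES, GGML_USE_HIPBLAS, RDNA2/RDNA3) are assumed to come from the existing headers.

// One set of per-architecture tile sizes per quantization type,
// instead of a block of MMQ_X_*/MMQ_Y_*/NWARPS_* macros per type.
struct mmq_arch_config_t {
    int x;      // mmq_x: tile width in the output matrix
    int y;      // mmq_y: tile height in the output matrix
    int nwarps; // warps per thread block
};

struct mmq_config_t {
    mmq_arch_config_t rdna2;
    mmq_arch_config_t rdna1;
    mmq_arch_config_t ampere;
    mmq_arch_config_t pascal;
};

// Replaces the MMQ_X_Q4_0_*/MMQ_Y_Q4_0_*/NWARPS_Q4_0_* macro block.
constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
    //  x    y  nwarps
    { 64, 128, 8},   // RDNA2
    { 64,  64, 8},   // RDNA1
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},   // Volta and newer, tensor-core build
#else
    { 64, 128, 4},   // Volta and newer
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},   // Pascal
};

// Picks the entry matching the architecture the device code is compiled for.
static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    return mmq_config.rdna2;
#else
    return mmq_config.rdna1;
#endif // defined(RDNA3) || defined(RDNA2)
#else
#if __CUDA_ARCH__ >= CC_VOLTA
    return mmq_config.ampere;
#else
    return mmq_config.pascal;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

// Inside each kernel, a single constexpr lookup replaces the old per-architecture
// #if/#elif ladder that re-instantiated mul_mat_q with hard-coded macros, e.g. for Q4_0:
//   constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
//   mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y,
//             arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
//             load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>,
//             VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
//       (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

With this in place each kernel body only differs in the quantization constants it passes, which is where the +271/-966 line delta comes from.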
ggml-cuda/mmq.cu CHANGED
@@ -9,6 +9,135 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
9
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
10
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
11
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
12
 
13
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
14
  GGML_UNUSED(x_qh);
@@ -943,25 +1072,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
943
  return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
944
  }
945
 
946
- #define MMQ_X_Q4_0_RDNA2 64
947
- #define MMQ_Y_Q4_0_RDNA2 128
948
- #define NWARPS_Q4_0_RDNA2 8
949
- #define MMQ_X_Q4_0_RDNA1 64
950
- #define MMQ_Y_Q4_0_RDNA1 64
951
- #define NWARPS_Q4_0_RDNA1 8
952
- #if defined(CUDA_USE_TENSOR_CORES)
953
- #define MMQ_X_Q4_0_AMPERE 4
954
- #define MMQ_Y_Q4_0_AMPERE 32
955
- #define NWARPS_Q4_0_AMPERE 4
956
- #else
957
- #define MMQ_X_Q4_0_AMPERE 64
958
- #define MMQ_Y_Q4_0_AMPERE 128
959
- #define NWARPS_Q4_0_AMPERE 4
960
- #endif
961
- #define MMQ_X_Q4_0_PASCAL 64
962
- #define MMQ_Y_Q4_0_PASCAL 64
963
- #define NWARPS_Q4_0_PASCAL 8
964
-
965
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
966
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
967
  static __device__ __forceinline__ void mul_mat_q(
@@ -1072,1107 +1182,265 @@ static __device__ __forceinline__ void mul_mat_q(
1072
  }
1073
  }
1074
 
1075
- template <bool need_check> static __global__ void
1076
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1077
- #if defined(RDNA3) || defined(RDNA2)
1078
- __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
1079
- #endif // defined(RDNA3) || defined(RDNA2)
1080
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1081
- mul_mat_q4_0(
1082
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1083
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1084
 
1085
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
1086
  #if defined(RDNA3) || defined(RDNA2)
1087
- const int mmq_x = MMQ_X_Q4_0_RDNA2;
1088
- const int mmq_y = MMQ_Y_Q4_0_RDNA2;
1089
- const int nwarps = NWARPS_Q4_0_RDNA2;
1090
  #else
1091
- const int mmq_x = MMQ_X_Q4_0_RDNA1;
1092
- const int mmq_y = MMQ_Y_Q4_0_RDNA1;
1093
- const int nwarps = NWARPS_Q4_0_RDNA1;
1094
  #endif // defined(RDNA3) || defined(RDNA2)
1095
 
1096
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
1097
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
1098
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1099
 
1100
- #elif __CUDA_ARCH__ >= CC_VOLTA
1101
- const int mmq_x = MMQ_X_Q4_0_AMPERE;
1102
- const int mmq_y = MMQ_Y_Q4_0_AMPERE;
1103
- const int nwarps = NWARPS_Q4_0_AMPERE;
 
1104
 
1105
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
1106
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
1107
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1108
 
1109
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1110
- const int mmq_x = MMQ_X_Q4_0_PASCAL;
1111
- const int mmq_y = MMQ_Y_Q4_0_PASCAL;
1112
- const int nwarps = NWARPS_Q4_0_PASCAL;
1113
 
1114
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
1115
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
1116
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1117
  #else
1118
  GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
1119
  NO_DEVICE_CODE;
1120
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1121
  }
1122
 
1123
- #define MMQ_X_Q4_1_RDNA2 64
1124
- #define MMQ_Y_Q4_1_RDNA2 128
1125
- #define NWARPS_Q4_1_RDNA2 8
1126
- #define MMQ_X_Q4_1_RDNA1 64
1127
- #define MMQ_Y_Q4_1_RDNA1 64
1128
- #define NWARPS_Q4_1_RDNA1 8
1129
- #if defined(CUDA_USE_TENSOR_CORES)
1130
- #define MMQ_X_Q4_1_AMPERE 4
1131
- #define MMQ_Y_Q4_1_AMPERE 32
1132
- #define NWARPS_Q4_1_AMPERE 4
1133
- #else
1134
- #define MMQ_X_Q4_1_AMPERE 64
1135
- #define MMQ_Y_Q4_1_AMPERE 128
1136
- #define NWARPS_Q4_1_AMPERE 4
1137
- #endif
1138
- #define MMQ_X_Q4_1_PASCAL 64
1139
- #define MMQ_Y_Q4_1_PASCAL 64
1140
- #define NWARPS_Q4_1_PASCAL 8
1141
-
1142
  template <bool need_check> static __global__ void
1143
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1144
  #if defined(RDNA3) || defined(RDNA2)
1145
- __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
1146
  #endif // defined(RDNA3) || defined(RDNA2)
1147
  #elif __CUDA_ARCH__ < CC_VOLTA
1148
- __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
1149
  #endif // __CUDA_ARCH__ < CC_VOLTA
1150
  mul_mat_q4_1(
1151
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1152
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1153
 
1154
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1155
- #if defined(RDNA3) || defined(RDNA2)
1156
- const int mmq_x = MMQ_X_Q4_1_RDNA2;
1157
- const int mmq_y = MMQ_Y_Q4_1_RDNA2;
1158
- const int nwarps = NWARPS_Q4_1_RDNA2;
1159
- #else
1160
- const int mmq_x = MMQ_X_Q4_1_RDNA1;
1161
- const int mmq_y = MMQ_Y_Q4_1_RDNA1;
1162
- const int nwarps = NWARPS_Q4_1_RDNA1;
1163
- #endif // defined(RDNA3) || defined(RDNA2)
1164
-
1165
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
1166
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
1167
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1168
-
1169
- #elif __CUDA_ARCH__ >= CC_VOLTA
1170
- const int mmq_x = MMQ_X_Q4_1_AMPERE;
1171
- const int mmq_y = MMQ_Y_Q4_1_AMPERE;
1172
- const int nwarps = NWARPS_Q4_1_AMPERE;
1173
-
1174
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
1175
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
1176
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1177
-
1178
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1179
- const int mmq_x = MMQ_X_Q4_1_PASCAL;
1180
- const int mmq_y = MMQ_Y_Q4_1_PASCAL;
1181
- const int nwarps = NWARPS_Q4_1_PASCAL;
1182
 
1183
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
1184
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
1185
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1186
  #else
1187
  GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
1188
  NO_DEVICE_CODE;
1189
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1190
  }
1191
 
1192
- #define MMQ_X_Q5_0_RDNA2 64
1193
- #define MMQ_Y_Q5_0_RDNA2 128
1194
- #define NWARPS_Q5_0_RDNA2 8
1195
- #define MMQ_X_Q5_0_RDNA1 64
1196
- #define MMQ_Y_Q5_0_RDNA1 64
1197
- #define NWARPS_Q5_0_RDNA1 8
1198
- #if defined(CUDA_USE_TENSOR_CORES)
1199
- #define MMQ_X_Q5_0_AMPERE 4
1200
- #define MMQ_Y_Q5_0_AMPERE 32
1201
- #define NWARPS_Q5_0_AMPERE 4
1202
- #else
1203
- #define MMQ_X_Q5_0_AMPERE 128
1204
- #define MMQ_Y_Q5_0_AMPERE 64
1205
- #define NWARPS_Q5_0_AMPERE 4
1206
- #endif
1207
- #define MMQ_X_Q5_0_PASCAL 64
1208
- #define MMQ_Y_Q5_0_PASCAL 64
1209
- #define NWARPS_Q5_0_PASCAL 8
1210
-
1211
  template <bool need_check> static __global__ void
1212
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1213
  #if defined(RDNA3) || defined(RDNA2)
1214
- __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
1215
  #endif // defined(RDNA3) || defined(RDNA2)
1216
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1217
  mul_mat_q5_0(
1218
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1219
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1220
 
1221
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1222
- #if defined(RDNA3) || defined(RDNA2)
1223
- const int mmq_x = MMQ_X_Q5_0_RDNA2;
1224
- const int mmq_y = MMQ_Y_Q5_0_RDNA2;
1225
- const int nwarps = NWARPS_Q5_0_RDNA2;
1226
- #else
1227
- const int mmq_x = MMQ_X_Q5_0_RDNA1;
1228
- const int mmq_y = MMQ_Y_Q5_0_RDNA1;
1229
- const int nwarps = NWARPS_Q5_0_RDNA1;
1230
- #endif // defined(RDNA3) || defined(RDNA2)
1231
 
1232
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
1233
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
1234
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1235
-
1236
- #elif __CUDA_ARCH__ >= CC_VOLTA
1237
- const int mmq_x = MMQ_X_Q5_0_AMPERE;
1238
- const int mmq_y = MMQ_Y_Q5_0_AMPERE;
1239
- const int nwarps = NWARPS_Q5_0_AMPERE;
1240
-
1241
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
1242
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
1243
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1244
-
1245
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1246
- const int mmq_x = MMQ_X_Q5_0_PASCAL;
1247
- const int mmq_y = MMQ_Y_Q5_0_PASCAL;
1248
- const int nwarps = NWARPS_Q5_0_PASCAL;
1249
-
1250
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
1251
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
1252
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1253
  #else
1254
  GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
1255
  NO_DEVICE_CODE;
1256
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1257
  }
1258
 
1259
- #define MMQ_X_Q5_1_RDNA2 64
1260
- #define MMQ_Y_Q5_1_RDNA2 128
1261
- #define NWARPS_Q5_1_RDNA2 8
1262
- #define MMQ_X_Q5_1_RDNA1 64
1263
- #define MMQ_Y_Q5_1_RDNA1 64
1264
- #define NWARPS_Q5_1_RDNA1 8
1265
- #if defined(CUDA_USE_TENSOR_CORES)
1266
- #define MMQ_X_Q5_1_AMPERE 4
1267
- #define MMQ_Y_Q5_1_AMPERE 32
1268
- #define NWARPS_Q5_1_AMPERE 4
1269
- #else
1270
- #define MMQ_X_Q5_1_AMPERE 128
1271
- #define MMQ_Y_Q5_1_AMPERE 64
1272
- #define NWARPS_Q5_1_AMPERE 4
1273
- #endif
1274
- #define MMQ_X_Q5_1_PASCAL 64
1275
- #define MMQ_Y_Q5_1_PASCAL 64
1276
- #define NWARPS_Q5_1_PASCAL 8
1277
-
1278
  template <bool need_check> static __global__ void
1279
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1280
  #if defined(RDNA3) || defined(RDNA2)
1281
- __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
1282
  #endif // defined(RDNA3) || defined(RDNA2)
1283
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1284
  mul_mat_q5_1(
1285
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1286
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1287
 
1288
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1289
- #if defined(RDNA3) || defined(RDNA2)
1290
- const int mmq_x = MMQ_X_Q5_1_RDNA2;
1291
- const int mmq_y = MMQ_Y_Q5_1_RDNA2;
1292
- const int nwarps = NWARPS_Q5_1_RDNA2;
1293
- #else
1294
- const int mmq_x = MMQ_X_Q5_1_RDNA1;
1295
- const int mmq_y = MMQ_Y_Q5_1_RDNA1;
1296
- const int nwarps = NWARPS_Q5_1_RDNA1;
1297
- #endif // defined(RDNA3) || defined(RDNA2)
1298
-
1299
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
1300
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
1301
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1302
-
1303
- #elif __CUDA_ARCH__ >= CC_VOLTA
1304
- const int mmq_x = MMQ_X_Q5_1_AMPERE;
1305
- const int mmq_y = MMQ_Y_Q5_1_AMPERE;
1306
- const int nwarps = NWARPS_Q5_1_AMPERE;
1307
 
1308
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
1309
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
1310
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1311
-
1312
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1313
- const int mmq_x = MMQ_X_Q5_1_PASCAL;
1314
- const int mmq_y = MMQ_Y_Q5_1_PASCAL;
1315
- const int nwarps = NWARPS_Q5_1_PASCAL;
1316
-
1317
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
1318
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
1319
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1320
  #else
1321
  GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
1322
  NO_DEVICE_CODE;
1323
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1324
  }
1325
 
1326
- #define MMQ_X_Q8_0_RDNA2 64
1327
- #define MMQ_Y_Q8_0_RDNA2 128
1328
- #define NWARPS_Q8_0_RDNA2 8
1329
- #define MMQ_X_Q8_0_RDNA1 64
1330
- #define MMQ_Y_Q8_0_RDNA1 64
1331
- #define NWARPS_Q8_0_RDNA1 8
1332
- #if defined(CUDA_USE_TENSOR_CORES)
1333
- #define MMQ_X_Q8_0_AMPERE 4
1334
- #define MMQ_Y_Q8_0_AMPERE 32
1335
- #define NWARPS_Q8_0_AMPERE 4
1336
- #else
1337
- #define MMQ_X_Q8_0_AMPERE 128
1338
- #define MMQ_Y_Q8_0_AMPERE 64
1339
- #define NWARPS_Q8_0_AMPERE 4
1340
- #endif
1341
- #define MMQ_X_Q8_0_PASCAL 64
1342
- #define MMQ_Y_Q8_0_PASCAL 64
1343
- #define NWARPS_Q8_0_PASCAL 8
1344
-
1345
  template <bool need_check> static __global__ void
1346
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1347
  #if defined(RDNA3) || defined(RDNA2)
1348
- __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
1349
  #endif // defined(RDNA3) || defined(RDNA2)
1350
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1351
  mul_mat_q8_0(
1352
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1353
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1354
 
1355
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1356
- #if defined(RDNA3) || defined(RDNA2)
1357
- const int mmq_x = MMQ_X_Q8_0_RDNA2;
1358
- const int mmq_y = MMQ_Y_Q8_0_RDNA2;
1359
- const int nwarps = NWARPS_Q8_0_RDNA2;
1360
- #else
1361
- const int mmq_x = MMQ_X_Q8_0_RDNA1;
1362
- const int mmq_y = MMQ_Y_Q8_0_RDNA1;
1363
- const int nwarps = NWARPS_Q8_0_RDNA1;
1364
- #endif // defined(RDNA3) || defined(RDNA2)
1365
-
1366
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
1367
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
1368
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1369
-
1370
- #elif __CUDA_ARCH__ >= CC_VOLTA
1371
- const int mmq_x = MMQ_X_Q8_0_AMPERE;
1372
- const int mmq_y = MMQ_Y_Q8_0_AMPERE;
1373
- const int nwarps = NWARPS_Q8_0_AMPERE;
1374
-
1375
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
1376
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
1377
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1378
-
1379
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1380
- const int mmq_x = MMQ_X_Q8_0_PASCAL;
1381
- const int mmq_y = MMQ_Y_Q8_0_PASCAL;
1382
- const int nwarps = NWARPS_Q8_0_PASCAL;
1383
 
1384
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
1385
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
1386
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1387
  #else
1388
  GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
1389
  NO_DEVICE_CODE;
1390
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1391
  }
1392
 
1393
- #define MMQ_X_Q2_K_RDNA2 64
1394
- #define MMQ_Y_Q2_K_RDNA2 128
1395
- #define NWARPS_Q2_K_RDNA2 8
1396
- #define MMQ_X_Q2_K_RDNA1 128
1397
- #define MMQ_Y_Q2_K_RDNA1 32
1398
- #define NWARPS_Q2_K_RDNA1 8
1399
- #if defined(CUDA_USE_TENSOR_CORES)
1400
- #define MMQ_X_Q2_K_AMPERE 4
1401
- #define MMQ_Y_Q2_K_AMPERE 32
1402
- #define NWARPS_Q2_K_AMPERE 4
1403
- #else
1404
- #define MMQ_X_Q2_K_AMPERE 64
1405
- #define MMQ_Y_Q2_K_AMPERE 128
1406
- #define NWARPS_Q2_K_AMPERE 4
1407
- #endif
1408
- #define MMQ_X_Q2_K_PASCAL 64
1409
- #define MMQ_Y_Q2_K_PASCAL 64
1410
- #define NWARPS_Q2_K_PASCAL 8
1411
-
1412
  template <bool need_check> static __global__ void
1413
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1414
  #if defined(RDNA3) || defined(RDNA2)
1415
- __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
1416
  #endif // defined(RDNA3) || defined(RDNA2)
1417
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1418
  mul_mat_q2_K(
1419
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1420
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1421
 
1422
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1423
- #if defined(RDNA3) || defined(RDNA2)
1424
- const int mmq_x = MMQ_X_Q2_K_RDNA2;
1425
- const int mmq_y = MMQ_Y_Q2_K_RDNA2;
1426
- const int nwarps = NWARPS_Q2_K_RDNA2;
1427
- #else
1428
- const int mmq_x = MMQ_X_Q2_K_RDNA1;
1429
- const int mmq_y = MMQ_Y_Q2_K_RDNA1;
1430
- const int nwarps = NWARPS_Q2_K_RDNA1;
1431
- #endif // defined(RDNA3) || defined(RDNA2)
1432
 
1433
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
1434
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
1435
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1436
-
1437
- #elif __CUDA_ARCH__ >= CC_VOLTA
1438
- const int mmq_x = MMQ_X_Q2_K_AMPERE;
1439
- const int mmq_y = MMQ_Y_Q2_K_AMPERE;
1440
- const int nwarps = NWARPS_Q2_K_AMPERE;
1441
-
1442
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
1443
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
1444
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1445
-
1446
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1447
- const int mmq_x = MMQ_X_Q2_K_PASCAL;
1448
- const int mmq_y = MMQ_Y_Q2_K_PASCAL;
1449
- const int nwarps = NWARPS_Q2_K_PASCAL;
1450
-
1451
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
1452
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
1453
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1454
  #else
1455
  GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
1456
  NO_DEVICE_CODE;
1457
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1458
  }
1459
 
1460
- #define MMQ_X_Q3_K_RDNA2 128
1461
- #define MMQ_Y_Q3_K_RDNA2 64
1462
- #define NWARPS_Q3_K_RDNA2 8
1463
- #define MMQ_X_Q3_K_RDNA1 32
1464
- #define MMQ_Y_Q3_K_RDNA1 128
1465
- #define NWARPS_Q3_K_RDNA1 8
1466
- #if defined(CUDA_USE_TENSOR_CORES)
1467
- #define MMQ_X_Q3_K_AMPERE 4
1468
- #define MMQ_Y_Q3_K_AMPERE 32
1469
- #define NWARPS_Q3_K_AMPERE 4
1470
- #else
1471
- #define MMQ_X_Q3_K_AMPERE 128
1472
- #define MMQ_Y_Q3_K_AMPERE 128
1473
- #define NWARPS_Q3_K_AMPERE 4
1474
- #endif
1475
- #define MMQ_X_Q3_K_PASCAL 64
1476
- #define MMQ_Y_Q3_K_PASCAL 64
1477
- #define NWARPS_Q3_K_PASCAL 8
1478
-
1479
  template <bool need_check> static __global__ void
1480
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1481
  #if defined(RDNA3) || defined(RDNA2)
1482
- __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
1483
  #endif // defined(RDNA3) || defined(RDNA2)
1484
  #elif __CUDA_ARCH__ < CC_VOLTA
1485
- __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
1486
  #endif // __CUDA_ARCH__ < CC_VOLTA
1487
  mul_mat_q3_K(
1488
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1489
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1490
 
1491
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1492
- #if defined(RDNA3) || defined(RDNA2)
1493
- const int mmq_x = MMQ_X_Q3_K_RDNA2;
1494
- const int mmq_y = MMQ_Y_Q3_K_RDNA2;
1495
- const int nwarps = NWARPS_Q3_K_RDNA2;
1496
- #else
1497
- const int mmq_x = MMQ_X_Q3_K_RDNA1;
1498
- const int mmq_y = MMQ_Y_Q3_K_RDNA1;
1499
- const int nwarps = NWARPS_Q3_K_RDNA1;
1500
- #endif // defined(RDNA3) || defined(RDNA2)
1501
-
1502
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
1503
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
1504
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1505
-
1506
- #elif __CUDA_ARCH__ >= CC_VOLTA
1507
- const int mmq_x = MMQ_X_Q3_K_AMPERE;
1508
- const int mmq_y = MMQ_Y_Q3_K_AMPERE;
1509
- const int nwarps = NWARPS_Q3_K_AMPERE;
1510
 
1511
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
1512
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
1513
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1514
-
1515
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1516
- const int mmq_x = MMQ_X_Q3_K_PASCAL;
1517
- const int mmq_y = MMQ_Y_Q3_K_PASCAL;
1518
- const int nwarps = NWARPS_Q3_K_PASCAL;
1519
-
1520
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
1521
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
1522
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1523
  #else
1524
  GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
1525
  NO_DEVICE_CODE;
1526
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1527
  }
1528
 
1529
- #define MMQ_X_Q4_K_RDNA2 64
1530
- #define MMQ_Y_Q4_K_RDNA2 128
1531
- #define NWARPS_Q4_K_RDNA2 8
1532
- #define MMQ_X_Q4_K_RDNA1 32
1533
- #define MMQ_Y_Q4_K_RDNA1 64
1534
- #define NWARPS_Q4_K_RDNA1 8
1535
- #if defined(CUDA_USE_TENSOR_CORES)
1536
- #define MMQ_X_Q4_K_AMPERE 4
1537
- #define MMQ_Y_Q4_K_AMPERE 32
1538
- #define NWARPS_Q4_K_AMPERE 4
1539
- #else
1540
- #define MMQ_X_Q4_K_AMPERE 64
1541
- #define MMQ_Y_Q4_K_AMPERE 128
1542
- #define NWARPS_Q4_K_AMPERE 4
1543
- #endif
1544
- #define MMQ_X_Q4_K_PASCAL 64
1545
- #define MMQ_Y_Q4_K_PASCAL 64
1546
- #define NWARPS_Q4_K_PASCAL 8
1547
-
1548
  template <bool need_check> static __global__ void
1549
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1550
  #if defined(RDNA3) || defined(RDNA2)
1551
- __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
1552
  #endif // defined(RDNA3) || defined(RDNA2)
1553
  #elif __CUDA_ARCH__ < CC_VOLTA
1554
- __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
1555
  #endif // __CUDA_ARCH__ < CC_VOLTA
1556
  mul_mat_q4_K(
1557
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1558
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1559
 
1560
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1561
- #if defined(RDNA3) || defined(RDNA2)
1562
- const int mmq_x = MMQ_X_Q4_K_RDNA2;
1563
- const int mmq_y = MMQ_Y_Q4_K_RDNA2;
1564
- const int nwarps = NWARPS_Q4_K_RDNA2;
1565
- #else
1566
- const int mmq_x = MMQ_X_Q4_K_RDNA1;
1567
- const int mmq_y = MMQ_Y_Q4_K_RDNA1;
1568
- const int nwarps = NWARPS_Q4_K_RDNA1;
1569
- #endif // defined(RDNA3) || defined(RDNA2)
1570
-
1571
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
1572
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
1573
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1574
-
1575
- #elif __CUDA_ARCH__ >= CC_VOLTA
1576
- const int mmq_x = MMQ_X_Q4_K_AMPERE;
1577
- const int mmq_y = MMQ_Y_Q4_K_AMPERE;
1578
- const int nwarps = NWARPS_Q4_K_AMPERE;
1579
-
1580
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
1581
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
1582
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1583
-
1584
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1585
- const int mmq_x = MMQ_X_Q4_K_PASCAL;
1586
- const int mmq_y = MMQ_Y_Q4_K_PASCAL;
1587
- const int nwarps = NWARPS_Q4_K_PASCAL;
1588
 
1589
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
1590
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
1591
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1592
  #else
1593
  GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
1594
  NO_DEVICE_CODE;
1595
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1596
  }
1597
 
1598
- #define MMQ_X_Q5_K_RDNA2 64
1599
- #define MMQ_Y_Q5_K_RDNA2 128
1600
- #define NWARPS_Q5_K_RDNA2 8
1601
- #define MMQ_X_Q5_K_RDNA1 32
1602
- #define MMQ_Y_Q5_K_RDNA1 64
1603
- #define NWARPS_Q5_K_RDNA1 8
1604
- #if defined(CUDA_USE_TENSOR_CORES)
1605
- #define MMQ_X_Q5_K_AMPERE 4
1606
- #define MMQ_Y_Q5_K_AMPERE 32
1607
- #define NWARPS_Q5_K_AMPERE 4
1608
- #else
1609
- #define MMQ_X_Q5_K_AMPERE 64
1610
- #define MMQ_Y_Q5_K_AMPERE 128
1611
- #define NWARPS_Q5_K_AMPERE 4
1612
- #endif
1613
- #define MMQ_X_Q5_K_PASCAL 64
1614
- #define MMQ_Y_Q5_K_PASCAL 64
1615
- #define NWARPS_Q5_K_PASCAL 8
1616
-
1617
  template <bool need_check> static __global__ void
1618
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1619
  #if defined(RDNA3) || defined(RDNA2)
1620
- __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
1621
  #endif // defined(RDNA3) || defined(RDNA2)
1622
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1623
  mul_mat_q5_K(
1624
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1625
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1626
 
1627
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1628
- #if defined(RDNA3) || defined(RDNA2)
1629
- const int mmq_x = MMQ_X_Q5_K_RDNA2;
1630
- const int mmq_y = MMQ_Y_Q5_K_RDNA2;
1631
- const int nwarps = NWARPS_Q5_K_RDNA2;
1632
- #else
1633
- const int mmq_x = MMQ_X_Q5_K_RDNA1;
1634
- const int mmq_y = MMQ_Y_Q5_K_RDNA1;
1635
- const int nwarps = NWARPS_Q5_K_RDNA1;
1636
- #endif // defined(RDNA3) || defined(RDNA2)
1637
-
1638
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
1639
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
1640
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1641
-
1642
- #elif __CUDA_ARCH__ >= CC_VOLTA
1643
- const int mmq_x = MMQ_X_Q5_K_AMPERE;
1644
- const int mmq_y = MMQ_Y_Q5_K_AMPERE;
1645
- const int nwarps = NWARPS_Q5_K_AMPERE;
1646
 
1647
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
1648
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
1649
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1650
-
1651
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1652
- const int mmq_x = MMQ_X_Q5_K_PASCAL;
1653
- const int mmq_y = MMQ_Y_Q5_K_PASCAL;
1654
- const int nwarps = NWARPS_Q5_K_PASCAL;
1655
-
1656
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
1657
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
1658
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1659
  #else
1660
  GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
1661
  NO_DEVICE_CODE;
1662
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1663
  }
1664
 
1665
- #define MMQ_X_Q6_K_RDNA2 64
1666
- #define MMQ_Y_Q6_K_RDNA2 128
1667
- #define NWARPS_Q6_K_RDNA2 8
1668
- #define MMQ_X_Q6_K_RDNA1 32
1669
- #define MMQ_Y_Q6_K_RDNA1 64
1670
- #define NWARPS_Q6_K_RDNA1 8
1671
- #if defined(CUDA_USE_TENSOR_CORES)
1672
- #define MMQ_X_Q6_K_AMPERE 4
1673
- #define MMQ_Y_Q6_K_AMPERE 32
1674
- #define NWARPS_Q6_K_AMPERE 4
1675
- #else
1676
- #define MMQ_X_Q6_K_AMPERE 64
1677
- #define MMQ_Y_Q6_K_AMPERE 64
1678
- #define NWARPS_Q6_K_AMPERE 4
1679
- #endif
1680
- #define MMQ_X_Q6_K_PASCAL 64
1681
- #define MMQ_Y_Q6_K_PASCAL 64
1682
- #define NWARPS_Q6_K_PASCAL 8
1683
-
1684
  template <bool need_check> static __global__ void
1685
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1686
  #if defined(RDNA3) || defined(RDNA2)
1687
- __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
1688
  #endif // defined(RDNA3) || defined(RDNA2)
1689
  #elif __CUDA_ARCH__ < CC_VOLTA
1690
- __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
1691
  #endif // __CUDA_ARCH__ < CC_VOLTA
1692
  mul_mat_q6_K(
1693
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1694
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1695
 
1696
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1697
- #if defined(RDNA3) || defined(RDNA2)
1698
- const int mmq_x = MMQ_X_Q6_K_RDNA2;
1699
- const int mmq_y = MMQ_Y_Q6_K_RDNA2;
1700
- const int nwarps = NWARPS_Q6_K_RDNA2;
1701
- #else
1702
- const int mmq_x = MMQ_X_Q6_K_RDNA1;
1703
- const int mmq_y = MMQ_Y_Q6_K_RDNA1;
1704
- const int nwarps = NWARPS_Q6_K_RDNA1;
1705
- #endif // defined(RDNA3) || defined(RDNA2)
1706
-
1707
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
1708
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
1709
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1710
-
1711
- #elif __CUDA_ARCH__ >= CC_VOLTA
1712
- const int mmq_x = MMQ_X_Q6_K_AMPERE;
1713
- const int mmq_y = MMQ_Y_Q6_K_AMPERE;
1714
- const int nwarps = NWARPS_Q6_K_AMPERE;
1715
-
1716
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
1717
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
1718
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1719
-
1720
- #elif __CUDA_ARCH__ >= MIN_CC_DP4A
1721
- const int mmq_x = MMQ_X_Q6_K_PASCAL;
1722
- const int mmq_y = MMQ_Y_Q6_K_PASCAL;
1723
- const int nwarps = NWARPS_Q6_K_PASCAL;
1724
 
1725
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
1726
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
1727
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1728
  #else
1729
  GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
1730
  NO_DEVICE_CODE;
1731
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1732
  }
1733
 
1734
- static void ggml_mul_mat_q4_0_q8_1_cuda(
1735
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1736
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1737
-
1738
- int id = ggml_cuda_get_device();
1739
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1740
-
1741
- int mmq_x, mmq_y, nwarps;
1742
- if (compute_capability >= CC_RDNA2) {
1743
- mmq_x = MMQ_X_Q4_0_RDNA2;
1744
- mmq_y = MMQ_Y_Q4_0_RDNA2;
1745
- nwarps = NWARPS_Q4_0_RDNA2;
1746
- } else if (compute_capability >= CC_OFFSET_AMD) {
1747
- mmq_x = MMQ_X_Q4_0_RDNA1;
1748
- mmq_y = MMQ_Y_Q4_0_RDNA1;
1749
- nwarps = NWARPS_Q4_0_RDNA1;
1750
- } else if (compute_capability >= CC_VOLTA) {
1751
- mmq_x = MMQ_X_Q4_0_AMPERE;
1752
- mmq_y = MMQ_Y_Q4_0_AMPERE;
1753
- nwarps = NWARPS_Q4_0_AMPERE;
1754
- } else if (compute_capability >= MIN_CC_DP4A) {
1755
- mmq_x = MMQ_X_Q4_0_PASCAL;
1756
- mmq_y = MMQ_Y_Q4_0_PASCAL;
1757
- nwarps = NWARPS_Q4_0_PASCAL;
1758
- } else {
1759
- GGML_ASSERT(false);
1760
- }
1761
-
1762
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1763
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1764
- const dim3 block_nums(block_num_x, block_num_y, 1);
1765
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1766
-
1767
- if (nrows_x % mmq_y == 0) {
1768
- const bool need_check = false;
1769
- mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
1770
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1771
- } else {
1772
- const bool need_check = true;
1773
- mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
1774
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1775
- }
1776
- }
1777
-
1778
- static void ggml_mul_mat_q4_1_q8_1_cuda(
1779
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1780
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1781
-
1782
- int id = ggml_cuda_get_device();
1783
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1784
-
1785
- int mmq_x, mmq_y, nwarps;
1786
- if (compute_capability >= CC_RDNA2) {
1787
- mmq_x = MMQ_X_Q4_1_RDNA2;
1788
- mmq_y = MMQ_Y_Q4_1_RDNA2;
1789
- nwarps = NWARPS_Q4_1_RDNA2;
1790
- } else if (compute_capability >= CC_OFFSET_AMD) {
1791
- mmq_x = MMQ_X_Q4_1_RDNA1;
1792
- mmq_y = MMQ_Y_Q4_1_RDNA1;
1793
- nwarps = NWARPS_Q4_1_RDNA1;
1794
- } else if (compute_capability >= CC_VOLTA) {
1795
- mmq_x = MMQ_X_Q4_1_AMPERE;
1796
- mmq_y = MMQ_Y_Q4_1_AMPERE;
1797
- nwarps = NWARPS_Q4_1_AMPERE;
1798
- } else if (compute_capability >= MIN_CC_DP4A) {
1799
- mmq_x = MMQ_X_Q4_1_PASCAL;
1800
- mmq_y = MMQ_Y_Q4_1_PASCAL;
1801
- nwarps = NWARPS_Q4_1_PASCAL;
1802
- } else {
1803
- GGML_ASSERT(false);
1804
- }
1805
-
1806
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1807
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1808
- const dim3 block_nums(block_num_x, block_num_y, 1);
1809
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1810
-
1811
- if (nrows_x % mmq_y == 0) {
1812
- const bool need_check = false;
1813
- mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
1814
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1815
- } else {
1816
- const bool need_check = true;
1817
- mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
1818
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1819
- }
1820
- }
1821
-
1822
- static void ggml_mul_mat_q5_0_q8_1_cuda(
1823
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1824
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1825
-
1826
- int id = ggml_cuda_get_device();
1827
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1828
-
1829
- int mmq_x, mmq_y, nwarps;
1830
- if (compute_capability >= CC_RDNA2) {
1831
- mmq_x = MMQ_X_Q5_0_RDNA2;
1832
- mmq_y = MMQ_Y_Q5_0_RDNA2;
1833
- nwarps = NWARPS_Q5_0_RDNA2;
1834
- } else if (compute_capability >= CC_OFFSET_AMD) {
1835
- mmq_x = MMQ_X_Q5_0_RDNA1;
1836
- mmq_y = MMQ_Y_Q5_0_RDNA1;
1837
- nwarps = NWARPS_Q5_0_RDNA1;
1838
- } else if (compute_capability >= CC_VOLTA) {
1839
- mmq_x = MMQ_X_Q5_0_AMPERE;
1840
- mmq_y = MMQ_Y_Q5_0_AMPERE;
1841
- nwarps = NWARPS_Q5_0_AMPERE;
1842
- } else if (compute_capability >= MIN_CC_DP4A) {
1843
- mmq_x = MMQ_X_Q5_0_PASCAL;
1844
- mmq_y = MMQ_Y_Q5_0_PASCAL;
1845
- nwarps = NWARPS_Q5_0_PASCAL;
1846
- } else {
1847
- GGML_ASSERT(false);
1848
- }
1849
-
1850
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1851
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1852
- const dim3 block_nums(block_num_x, block_num_y, 1);
1853
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1854
-
1855
- if (nrows_x % mmq_y == 0) {
1856
- const bool need_check = false;
1857
- mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
1858
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1859
- } else {
1860
- const bool need_check = true;
1861
- mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
1862
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1863
- }
1864
- }
1865
-
1866
- static void ggml_mul_mat_q5_1_q8_1_cuda(
1867
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1868
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1869
-
1870
- int id = ggml_cuda_get_device();
1871
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1872
-
1873
- int mmq_x, mmq_y, nwarps;
1874
- if (compute_capability >= CC_RDNA2) {
1875
- mmq_x = MMQ_X_Q5_1_RDNA2;
1876
- mmq_y = MMQ_Y_Q5_1_RDNA2;
1877
- nwarps = NWARPS_Q5_1_RDNA2;
1878
- } else if (compute_capability >= CC_OFFSET_AMD) {
1879
- mmq_x = MMQ_X_Q5_1_RDNA1;
1880
- mmq_y = MMQ_Y_Q5_1_RDNA1;
1881
- nwarps = NWARPS_Q5_1_RDNA1;
1882
- } else if (compute_capability >= CC_VOLTA) {
1883
- mmq_x = MMQ_X_Q5_1_AMPERE;
1884
- mmq_y = MMQ_Y_Q5_1_AMPERE;
1885
- nwarps = NWARPS_Q5_1_AMPERE;
1886
- } else if (compute_capability >= MIN_CC_DP4A) {
1887
- mmq_x = MMQ_X_Q5_1_PASCAL;
1888
- mmq_y = MMQ_Y_Q5_1_PASCAL;
1889
- nwarps = NWARPS_Q5_1_PASCAL;
1890
- } else {
1891
- GGML_ASSERT(false);
1892
- }
1893
-
1894
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1895
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1896
- const dim3 block_nums(block_num_x, block_num_y, 1);
1897
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1898
-
1899
- if (nrows_x % mmq_y == 0) {
1900
- const bool need_check = false;
1901
- mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
1902
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1903
- } else {
1904
- const bool need_check = true;
1905
- mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
1906
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1907
- }
1908
- }
1909
-
1910
- static void ggml_mul_mat_q8_0_q8_1_cuda(
1911
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1912
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1913
-
1914
- int id = ggml_cuda_get_device();
1915
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1916
-
1917
- int mmq_x, mmq_y, nwarps;
1918
- if (compute_capability >= CC_RDNA2) {
1919
- mmq_x = MMQ_X_Q8_0_RDNA2;
1920
- mmq_y = MMQ_Y_Q8_0_RDNA2;
1921
- nwarps = NWARPS_Q8_0_RDNA2;
1922
- } else if (compute_capability >= CC_OFFSET_AMD) {
1923
- mmq_x = MMQ_X_Q8_0_RDNA1;
1924
- mmq_y = MMQ_Y_Q8_0_RDNA1;
1925
- nwarps = NWARPS_Q8_0_RDNA1;
1926
- } else if (compute_capability >= CC_VOLTA) {
1927
- mmq_x = MMQ_X_Q8_0_AMPERE;
1928
- mmq_y = MMQ_Y_Q8_0_AMPERE;
1929
- nwarps = NWARPS_Q8_0_AMPERE;
1930
- } else if (compute_capability >= MIN_CC_DP4A) {
1931
- mmq_x = MMQ_X_Q8_0_PASCAL;
1932
- mmq_y = MMQ_Y_Q8_0_PASCAL;
1933
- nwarps = NWARPS_Q8_0_PASCAL;
1934
- } else {
1935
- GGML_ASSERT(false);
1936
- }
1937
-
1938
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1939
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1940
- const dim3 block_nums(block_num_x, block_num_y, 1);
1941
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1942
-
1943
- if (nrows_x % mmq_y == 0) {
1944
- const bool need_check = false;
1945
- mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
1946
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1947
- } else {
1948
- const bool need_check = true;
1949
- mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
1950
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1951
- }
1952
- }
1953
-
1954
- static void ggml_mul_mat_q2_K_q8_1_cuda(
1955
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
1956
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
1957
-
1958
- int id = ggml_cuda_get_device();
1959
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1960
-
1961
- int mmq_x, mmq_y, nwarps;
1962
- if (compute_capability >= CC_RDNA2) {
1963
- mmq_x = MMQ_X_Q2_K_RDNA2;
1964
- mmq_y = MMQ_Y_Q2_K_RDNA2;
1965
- nwarps = NWARPS_Q2_K_RDNA2;
1966
- } else if (compute_capability >= CC_OFFSET_AMD) {
1967
- mmq_x = MMQ_X_Q2_K_RDNA1;
1968
- mmq_y = MMQ_Y_Q2_K_RDNA1;
1969
- nwarps = NWARPS_Q2_K_RDNA1;
1970
- } else if (compute_capability >= CC_VOLTA) {
1971
- mmq_x = MMQ_X_Q2_K_AMPERE;
1972
- mmq_y = MMQ_Y_Q2_K_AMPERE;
1973
- nwarps = NWARPS_Q2_K_AMPERE;
1974
- } else if (compute_capability >= MIN_CC_DP4A) {
1975
- mmq_x = MMQ_X_Q2_K_PASCAL;
1976
- mmq_y = MMQ_Y_Q2_K_PASCAL;
1977
- nwarps = NWARPS_Q2_K_PASCAL;
1978
- } else {
1979
- GGML_ASSERT(false);
1980
- }
1981
-
1982
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
1983
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
1984
- const dim3 block_nums(block_num_x, block_num_y, 1);
1985
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
1986
-
1987
- if (nrows_x % mmq_y == 0) {
1988
- const bool need_check = false;
1989
- mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
1990
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1991
- } else {
1992
- const bool need_check = true;
1993
- mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
1994
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1995
- }
1996
- }
1997
-
1998
- static void ggml_mul_mat_q3_K_q8_1_cuda(
1999
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
2000
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
2001
-
2002
- #if QK_K == 256
2003
-
2004
- int id = ggml_cuda_get_device();
2005
- const int compute_capability = ggml_cuda_info().devices[id].cc;
2006
-
2007
- int mmq_x, mmq_y, nwarps;
2008
- if (compute_capability >= CC_RDNA2) {
2009
- mmq_x = MMQ_X_Q3_K_RDNA2;
2010
- mmq_y = MMQ_Y_Q3_K_RDNA2;
2011
- nwarps = NWARPS_Q3_K_RDNA2;
2012
- } else if (compute_capability >= CC_OFFSET_AMD) {
2013
- mmq_x = MMQ_X_Q3_K_RDNA1;
2014
- mmq_y = MMQ_Y_Q3_K_RDNA1;
2015
- nwarps = NWARPS_Q3_K_RDNA1;
2016
- } else if (compute_capability >= CC_VOLTA) {
2017
- mmq_x = MMQ_X_Q3_K_AMPERE;
2018
- mmq_y = MMQ_Y_Q3_K_AMPERE;
2019
- nwarps = NWARPS_Q3_K_AMPERE;
2020
- } else if (compute_capability >= MIN_CC_DP4A) {
2021
- mmq_x = MMQ_X_Q3_K_PASCAL;
2022
- mmq_y = MMQ_Y_Q3_K_PASCAL;
2023
- nwarps = NWARPS_Q3_K_PASCAL;
2024
- } else {
2025
- GGML_ASSERT(false);
2026
- }
2027
-
2028
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
2029
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
2030
- const dim3 block_nums(block_num_x, block_num_y, 1);
2031
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
2032
-
2033
- if (nrows_x % mmq_y == 0) {
2034
- const bool need_check = false;
2035
- mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
2036
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2037
- } else {
2038
- const bool need_check = true;
2039
- mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
2040
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2041
- }
2042
- #endif
2043
- }
2044
-
2045
- static void ggml_mul_mat_q4_K_q8_1_cuda(
2046
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
2047
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
2048
-
2049
- int id = ggml_cuda_get_device();
2050
- const int compute_capability = ggml_cuda_info().devices[id].cc;
2051
-
2052
- int mmq_x, mmq_y, nwarps;
2053
- if (compute_capability >= CC_RDNA2) {
2054
- mmq_x = MMQ_X_Q4_K_RDNA2;
2055
- mmq_y = MMQ_Y_Q4_K_RDNA2;
2056
- nwarps = NWARPS_Q4_K_RDNA2;
2057
- } else if (compute_capability >= CC_OFFSET_AMD) {
2058
- mmq_x = MMQ_X_Q4_K_RDNA1;
2059
- mmq_y = MMQ_Y_Q4_K_RDNA1;
2060
- nwarps = NWARPS_Q4_K_RDNA1;
2061
- } else if (compute_capability >= CC_VOLTA) {
2062
- mmq_x = MMQ_X_Q4_K_AMPERE;
2063
- mmq_y = MMQ_Y_Q4_K_AMPERE;
2064
- nwarps = NWARPS_Q4_K_AMPERE;
2065
- } else if (compute_capability >= MIN_CC_DP4A) {
2066
- mmq_x = MMQ_X_Q4_K_PASCAL;
2067
- mmq_y = MMQ_Y_Q4_K_PASCAL;
2068
- nwarps = NWARPS_Q4_K_PASCAL;
2069
- } else {
2070
- GGML_ASSERT(false);
2071
- }
2072
-
2073
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
2074
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
2075
- const dim3 block_nums(block_num_x, block_num_y, 1);
2076
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
2077
-
2078
- if (nrows_x % mmq_y == 0) {
2079
- const bool need_check = false;
2080
- mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
2081
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2082
- } else {
2083
- const bool need_check = true;
2084
- mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
2085
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2086
- }
2087
- }
2088
-
2089
- static void ggml_mul_mat_q5_K_q8_1_cuda(
2090
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
2091
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
2092
-
2093
- int id = ggml_cuda_get_device();
2094
- const int compute_capability = ggml_cuda_info().devices[id].cc;
2095
-
2096
- int mmq_x, mmq_y, nwarps;
2097
- if (compute_capability >= CC_RDNA2) {
2098
- mmq_x = MMQ_X_Q5_K_RDNA2;
2099
- mmq_y = MMQ_Y_Q5_K_RDNA2;
2100
- nwarps = NWARPS_Q5_K_RDNA2;
2101
- } else if (compute_capability >= CC_OFFSET_AMD) {
2102
- mmq_x = MMQ_X_Q5_K_RDNA1;
2103
- mmq_y = MMQ_Y_Q5_K_RDNA1;
2104
- nwarps = NWARPS_Q5_K_RDNA1;
2105
- } else if (compute_capability >= CC_VOLTA) {
2106
- mmq_x = MMQ_X_Q5_K_AMPERE;
2107
- mmq_y = MMQ_Y_Q5_K_AMPERE;
2108
- nwarps = NWARPS_Q5_K_AMPERE;
2109
- } else if (compute_capability >= MIN_CC_DP4A) {
2110
- mmq_x = MMQ_X_Q5_K_PASCAL;
2111
- mmq_y = MMQ_Y_Q5_K_PASCAL;
2112
- nwarps = NWARPS_Q5_K_PASCAL;
2113
- } else {
2114
- GGML_ASSERT(false);
2115
- }
2116
-
2117
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
2118
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
2119
- const dim3 block_nums(block_num_x, block_num_y, 1);
2120
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
2121
-
2122
- if (nrows_x % mmq_y == 0) {
2123
- const bool need_check = false;
2124
- mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
2125
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2126
- } else {
2127
- const bool need_check = true;
2128
- mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
2129
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2130
- }
2131
- }
2132
-
2133
- static void ggml_mul_mat_q6_K_q8_1_cuda(
2134
- const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
2135
- const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
2136
-
2137
- int id = ggml_cuda_get_device();
2138
- const int compute_capability = ggml_cuda_info().devices[id].cc;
2139
-
2140
- int mmq_x, mmq_y, nwarps;
2141
- if (compute_capability >= CC_RDNA2) {
2142
- mmq_x = MMQ_X_Q6_K_RDNA2;
2143
- mmq_y = MMQ_Y_Q6_K_RDNA2;
2144
- nwarps = NWARPS_Q6_K_RDNA2;
2145
- } else if (compute_capability >= CC_OFFSET_AMD) {
2146
- mmq_x = MMQ_X_Q6_K_RDNA1;
2147
- mmq_y = MMQ_Y_Q6_K_RDNA1;
2148
- nwarps = NWARPS_Q6_K_RDNA1;
2149
- } else if (compute_capability >= CC_VOLTA) {
2150
- mmq_x = MMQ_X_Q6_K_AMPERE;
2151
- mmq_y = MMQ_Y_Q6_K_AMPERE;
2152
- nwarps = NWARPS_Q6_K_AMPERE;
2153
- } else if (compute_capability >= MIN_CC_DP4A) {
2154
- mmq_x = MMQ_X_Q6_K_PASCAL;
2155
- mmq_y = MMQ_Y_Q6_K_PASCAL;
2156
- nwarps = NWARPS_Q6_K_PASCAL;
2157
- } else {
2158
- GGML_ASSERT(false);
2159
- }
2160
-
2161
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
2162
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
2163
- const dim3 block_nums(block_num_x, block_num_y, 1);
2164
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
2165
-
2166
- if (nrows_x % mmq_y == 0) {
2167
- const bool need_check = false;
2168
- mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
2169
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2170
- } else {
2171
- const bool need_check = true;
2172
- mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
2173
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
2174
- }
2175
- }
2176
 
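The ten removed host-side launchers above differ only in which tile-size macros they read and which kernel they launch; the replacement launch path lands further down in the file and is not visible in this excerpt. Below is a minimal sketch of how that repeated logic could fold into one helper, under the assumption that it is parameterized by the kernel and the mmq_arch_config_t chosen for the device. The helper name launch_mul_mat_q and its exact signature are illustrative, not taken from the commit; mul_mat_q_t is the function type added in this diff, and WARP_SIZE is assumed from the existing ggml-cuda headers.

// Illustrative only: factors out the grid/block setup and need_check dispatch
// that every removed ggml_mul_mat_q*_q8_1_cuda function repeated verbatim.
static void launch_mul_mat_q(
    mul_mat_q_t * kernel_no_check, mul_mat_q_t * kernel_check, const mmq_arch_config_t & cfg,
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int block_num_x = (nrows_x + cfg.y - 1) / cfg.y;
    const int block_num_y = (ncols_y + cfg.x - 1) / cfg.x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, cfg.nwarps, 1);

    if (nrows_x % cfg.y == 0) {
        // rows divide evenly into tiles: no bounds checking needed in the kernel
        kernel_no_check<<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        kernel_check<<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

// Hypothetical call site for Q4_0, with the per-architecture entry picked on the host
// from the device's compute capability, exactly as the removed code did:
//   launch_mul_mat_q(mul_mat_q4_0<false>, mul_mat_q4_0<true>, arch_config,
//                    vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, stream);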
2177
  void ggml_cuda_op_mul_mat_q(
2178
  ggml_backend_cuda_context & ctx,
@@ -2190,47 +1458,84 @@ void ggml_cuda_op_mul_mat_q(
2190
  const int64_t row_diff = row_high - row_low;
2191
 
2192
  int id = ggml_cuda_get_device();
 
2193
 
2194
  // the main device has a larger memory buffer to hold the results from all GPUs
2195
  // nrows_dst == nrows of the matrix that the kernel writes into
2196
  const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
2197
 
 
 
2198
  switch (src0->type) {
2199
  case GGML_TYPE_Q4_0:
2200
- ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2201
  break;
2202
  case GGML_TYPE_Q4_1:
2203
- ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2204
  break;
2205
  case GGML_TYPE_Q5_0:
2206
- ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2207
  break;
2208
  case GGML_TYPE_Q5_1:
2209
- ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2210
  break;
2211
  case GGML_TYPE_Q8_0:
2212
- ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2213
  break;
2214
  case GGML_TYPE_Q2_K:
2215
- ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2216
  break;
2217
  case GGML_TYPE_Q3_K:
2218
- ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2219
  break;
2220
  case GGML_TYPE_Q4_K:
2221
- ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2222
  break;
2223
  case GGML_TYPE_Q5_K:
2224
- ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2225
  break;
2226
  case GGML_TYPE_Q6_K:
2227
- ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
2228
  break;
2229
  default:
2230
  GGML_ASSERT(false);
2231
  break;
2232
  }
2233
 
2234
  GGML_UNUSED(src1);
2235
  GGML_UNUSED(dst);
2236
  GGML_UNUSED(src1_ddf_i);
 
9
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
10
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
11
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
12
+ typedef void (mul_mat_q_t)(
13
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
14
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
15
+
16
+ struct mmq_arch_config_t {
17
+ int x;
18
+ int y;
19
+ int nwarps;
20
+ };
21
+
22
+ struct mmq_config_t {
23
+ mmq_arch_config_t rdna2;
24
+ mmq_arch_config_t rdna1;
25
+ mmq_arch_config_t ampere;
26
+ mmq_arch_config_t pascal;
27
+ };
28
+
29
+ constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
30
+ // x y nwarps
31
+ { 64, 128, 8},
32
+ { 64, 64, 8},
33
+ #ifdef CUDA_USE_TENSOR_CORES
34
+ { 4, 32, 4},
35
+ #else
36
+ { 64, 128, 4},
37
+ #endif // CUDA_USE_TENSOR_CORES
38
+ { 64, 64, 8},
39
+ };
40
+ constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
41
+ // x y nwarps
42
+ { 64, 128, 8},
43
+ { 64, 64, 8},
44
+ #ifdef CUDA_USE_TENSOR_CORES
45
+ { 4, 32, 4},
46
+ #else
47
+ { 64, 128, 4},
48
+ #endif // CUDA_USE_TENSOR_CORES
49
+ { 64, 64, 8},
50
+ };
51
+ constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
52
+ // x y nwarps
53
+ { 64, 128, 8},
54
+ { 64, 64, 8},
55
+ #ifdef CUDA_USE_TENSOR_CORES
56
+ { 4, 32, 4},
57
+ #else
58
+ {128, 64, 4},
59
+ #endif // CUDA_USE_TENSOR_CORES
60
+ { 64, 64, 8},
61
+ };
62
+ constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
63
+ // x y nwarps
64
+ { 64, 128, 8},
65
+ { 64, 64, 8},
66
+ #ifdef CUDA_USE_TENSOR_CORES
67
+ { 4, 32, 4},
68
+ #else
69
+ {128, 64, 4},
70
+ #endif // CUDA_USE_TENSOR_CORES
71
+ { 64, 64, 8},
72
+ };
73
+ constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
74
+ // x y nwarps
75
+ { 64, 128, 8},
76
+ { 64, 64, 8},
77
+ #ifdef CUDA_USE_TENSOR_CORES
78
+ { 4, 32, 4},
79
+ #else
80
+ {128, 64, 4},
81
+ #endif // CUDA_USE_TENSOR_CORES
82
+ { 64, 64, 8},
83
+ };
84
+ constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
85
+ // x y nwarps
86
+ { 64, 128, 8},
87
+ {128, 32, 8},
88
+ #ifdef CUDA_USE_TENSOR_CORES
89
+ { 4, 32, 4},
90
+ #else
91
+ { 64, 128, 4},
92
+ #endif // CUDA_USE_TENSOR_CORES
93
+ { 64, 64, 8},
94
+ };
95
+ constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
96
+ // x y nwarps
97
+ {128, 64, 8},
98
+ { 32, 128, 8},
99
+ #ifdef CUDA_USE_TENSOR_CORES
100
+ { 4, 32, 4},
101
+ #else
102
+ {128, 128, 4},
103
+ #endif // CUDA_USE_TENSOR_CORES
104
+ { 64, 64, 8},
105
+ };
106
+ constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
107
+ // x y nwarps
108
+ { 64, 128, 8},
109
+ { 32, 64, 8},
110
+ #ifdef CUDA_USE_TENSOR_CORES
111
+ { 4, 32, 4},
112
+ #else
113
+ { 64, 128, 4},
114
+ #endif // CUDA_USE_TENSOR_CORES
115
+ { 64, 64, 8},
116
+ };
117
+ constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
118
+ // x y nwarps
119
+ { 64, 128, 8},
120
+ { 32, 64, 8},
121
+ #ifdef CUDA_USE_TENSOR_CORES
122
+ { 4, 32, 4},
123
+ #else
124
+ { 64, 128, 4},
125
+ #endif // CUDA_USE_TENSOR_CORES
126
+ { 64, 64, 8},
127
+ };
128
+ constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
129
+ // x y nwarps
130
+ { 64, 128, 8},
131
+ { 32, 64, 8},
132
+ #ifdef CUDA_USE_TENSOR_CORES
133
+ { 4, 32, 4},
134
+ #else
135
+ { 64, 64, 4},
136
+ #endif // CUDA_USE_TENSOR_CORES
137
+ { 64, 64, 8},
138
+ };
139
+
140
+ // ------------------------------------------------------------
141
 
142
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
143
  GGML_UNUSED(x_qh);
 
1072
  return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
1073
  }
1074
 
1075
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
1076
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
1077
  static __device__ __forceinline__ void mul_mat_q(
 
1182
  }
1183
  }
1184
 
1185
+ static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
1186
 
1187
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1188
+
1189
  #if defined(RDNA3) || defined(RDNA2)
1190
+ return mmq_config.rdna2;
 
 
1191
  #else
1192
+ return mmq_config.rdna1;
 
 
1193
  #endif // defined(RDNA3) || defined(RDNA2)
1194
 
1195
+ #else
 
 
1196
 
1197
+ #if __CUDA_ARCH__ >= CC_VOLTA
1198
+ return mmq_config.ampere;
1199
+ #else
1200
+ return mmq_config.pascal;
1201
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
1202
 
1203
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1204
+ }
 
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);

+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
+        load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);

+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
+        load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);

+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
+        load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);

+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
+        load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);

+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
+        load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
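
Because arch_config is constexpr, each of these kernels gets its tile shape baked in at compile time. As an editorial illustration derived from the MMQ_CONFIG_Q8_0 table above (not text from the commit): on an NVIDIA build with __CUDA_ARCH__ >= CC_VOLTA, get_arch_config_device(MMQ_CONFIG_Q8_0) selects the third row, so

    // With CUDA_USE_TENSOR_CORES defined:    arch_config == {  4,  32, 4}
    // Without CUDA_USE_TENSOR_CORES:         arch_config == {128,  64, 4}
    //
    // e.g. the non-tensor-core case effectively instantiates
    //   mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, 128, 64, 4,
    //             allocate_tiles_q8_0<64>, load_tiles_q8_0<64, 4, need_check>,
    //             VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>(...);
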

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);

+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
+        load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);

+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
+        load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);

+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
+        load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);

+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
+        load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <bool need_check> static __global__ void
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.pascal.nwarps, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);

+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
+        load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
     NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

+#define MMQ_SWITCH_CASE(type_suffix) \
+    case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
+        const bool need_check = false; \
+        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+    } else { \
+        const bool need_check = true; \
+        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+    } break; \
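
For readability, this is roughly what one use of the macro expands to inside the dispatch switch further down (the expansion is written out here by hand for type_suffix = 4_0; whitespace is illustrative):

    case GGML_TYPE_Q4_0: if (row_diff % arch_config.y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } break;

In other words, the bounds-checked kernel variant is launched only when row_diff is not an exact multiple of the tile height arch_config.y.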

 void ggml_cuda_op_mul_mat_q(
     ggml_backend_cuda_context & ctx,

     const int64_t row_diff = row_high - row_low;

     int id = ggml_cuda_get_device();
+    const int compute_capability = ggml_cuda_info().devices[id].cc;

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

+    mmq_config_t mmq_config;
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+            mmq_config = MMQ_CONFIG_Q4_0;
             break;
         case GGML_TYPE_Q4_1:
+            mmq_config = MMQ_CONFIG_Q4_1;
             break;
         case GGML_TYPE_Q5_0:
+            mmq_config = MMQ_CONFIG_Q5_0;
             break;
         case GGML_TYPE_Q5_1:
+            mmq_config = MMQ_CONFIG_Q5_1;
             break;
         case GGML_TYPE_Q8_0:
+            mmq_config = MMQ_CONFIG_Q8_0;
             break;
         case GGML_TYPE_Q2_K:
+            mmq_config = MMQ_CONFIG_Q2_K;
             break;
         case GGML_TYPE_Q3_K:
+            mmq_config = MMQ_CONFIG_Q3_K;
             break;
         case GGML_TYPE_Q4_K:
+            mmq_config = MMQ_CONFIG_Q4_K;
             break;
         case GGML_TYPE_Q5_K:
+            mmq_config = MMQ_CONFIG_Q5_K;
             break;
         case GGML_TYPE_Q6_K:
+            mmq_config = MMQ_CONFIG_Q6_K;
             break;
         default:
             GGML_ASSERT(false);
             break;
     }

+    mmq_arch_config_t arch_config;
+    if (compute_capability >= CC_RDNA2) {
+        arch_config = mmq_config.rdna2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        arch_config = mmq_config.rdna1;
+    } else if (compute_capability >= CC_VOLTA) {
+        arch_config = mmq_config.ampere;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        arch_config = mmq_config.pascal;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
+    const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
+
+    switch (src0->type) {
+        MMQ_SWITCH_CASE(4_0)
+        MMQ_SWITCH_CASE(4_1)
+        MMQ_SWITCH_CASE(5_0)
+        MMQ_SWITCH_CASE(5_1)
+        MMQ_SWITCH_CASE(8_0)
+        MMQ_SWITCH_CASE(2_K)
+        MMQ_SWITCH_CASE(3_K)
+        MMQ_SWITCH_CASE(4_K)
+        MMQ_SWITCH_CASE(5_K)
+        MMQ_SWITCH_CASE(6_K)
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
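
To make the launch geometry concrete, here is a worked example with hypothetical sizes (the numbers and the choice of type are mine, not from the commit): a Q2_K src0 on an NVIDIA GPU with compute capability >= CC_VOLTA, built without CUDA_USE_TENSOR_CORES, so arch_config is { 64, 128, 4} and WARP_SIZE is 32.

    // Hypothetical shapes: row_diff = 4096 src0 rows, src1_ncols = 512 columns.
    //   block_num_x = (4096 + 128 - 1) / 128 = 32   // tiles along the src0 rows
    //   block_num_y = ( 512 +  64 - 1) /  64 =  8   // tiles along the src1 columns
    //   block_nums  = dim3(32, 8, 1)                // 256 thread blocks
    //   block_dims  = dim3(32, 4, 1)                // 4 warps = 128 threads per block
    // Since 4096 % 128 == 0, MMQ_SWITCH_CASE(2_K) launches the need_check = false
    // variant of mul_mat_q2_K; otherwise the bounds-checked variant would be used.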
+
     GGML_UNUSED(src1);
     GGML_UNUSED(dst);
     GGML_UNUSED(src1_ddf_i);