Alberto Cabrera Pérez commited on
Commit
d3d52a4
·
1 Parent(s): 17d67da

sycl: fixed semantics of block offset calculation (llama/14814)

Browse files
ggml/src/ggml-sycl/quants.hpp CHANGED
@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
48
  };
49
 
50
  static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
51
- return { block_index * (traits::qk / traits::qr), 0 };
52
  }
53
 
54
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
55
- return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
56
  }
57
 
58
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
71
  }
72
 
73
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
74
- auto nblocks = (nrows * (ncols / traits::qk));
75
- return { nblocks * (QK_K / 2),
76
  (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
77
  }
78
 
79
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
80
-
81
- constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
82
  };
83
 
84
  template <> struct block_q_t<GGML_TYPE_Q6_K> {
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
90
  };
91
 
92
  static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
93
- auto low_bits_index = block_index * (traits::qk / traits::qr);
94
  // the index of high bits it's after all low bits
95
  auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
96
  return { low_bits_index, high_bits_index };
97
  }
98
 
99
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
100
- auto nblocks = (nrows * (ncols / traits::qk));
101
  auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
102
  auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
103
- auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
104
  return { block_scales, sb_scale };
105
  }
106
 
107
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
108
  };
 
109
  } // namespace ggml_sycl_reordered
110
 
111
  #endif // GGML_SYCL_QUANTS_HPP
 
48
  };
49
 
50
  static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
51
+ return { block_index * (QK4_0 / QR4_0), 0 };
52
  }
53
 
54
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
55
+ return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
56
  }
57
 
58
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 
71
  }
72
 
73
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
74
+ auto nblocks = (nrows * (ncols / QK_K));
75
+ return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
76
  (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
77
  }
78
 
79
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 
 
80
  };
81
 
82
  template <> struct block_q_t<GGML_TYPE_Q6_K> {
 
88
  };
89
 
90
  static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
91
+ auto low_bits_index = block_index * (QK_K / QR6_K);
92
  // the index of high bits it's after all low bits
93
  auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
94
  return { low_bits_index, high_bits_index };
95
  }
96
 
97
  static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
98
+ auto nblocks = (nrows * (ncols / QK_K));
99
  auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
100
  auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
101
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
102
  return { block_scales, sb_scale };
103
  }
104
 
105
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
106
  };
107
+
108
  } // namespace ggml_sycl_reordered
109
 
110
  #endif // GGML_SYCL_QUANTS_HPP
ggml/src/ggml-sycl/vecdotq.hpp CHANGED
@@ -350,11 +350,9 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
350
  __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
351
  const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
352
  const sycl::half2 * q8_1_ds, const int & iqs) {
353
- const int ib = ibx_offset.first / (QK_K / 2);
354
-
355
  const uint8_t * base = static_cast<const uint8_t *>(vbq);
356
  const uint8_t * qs = base + ibx_offset.first;
357
- const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
358
  const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
359
 
360
  const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
@@ -427,13 +425,11 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
427
  __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
428
  const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
429
  const int iqs) {
430
- const int ib = ibx_offset.first / (QK_K / 2);
431
-
432
  const uint8_t * base = static_cast<const uint8_t *>(vbq);
433
  const uint8_t * ql = base + ibx_offset.first;
434
  const uint8_t * qh = base + ibx_offset.second;
435
  const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
436
- const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib;
437
 
438
  const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
439
  const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
 
350
  __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
351
  const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
352
  const sycl::half2 * q8_1_ds, const int & iqs) {
 
 
353
  const uint8_t * base = static_cast<const uint8_t *>(vbq);
354
  const uint8_t * qs = base + ibx_offset.first;
355
+ const uint8_t * scs = base + d_offset.first;
356
  const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
357
 
358
  const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
 
425
  __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
426
  const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
427
  const int iqs) {
 
 
428
  const uint8_t * base = static_cast<const uint8_t *>(vbq);
429
  const uint8_t * ql = base + ibx_offset.first;
430
  const uint8_t * qh = base + ibx_offset.second;
431
  const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
432
+ const ggml_half * d = (const ggml_half *) (base + d_offset.second);
433
 
434
  const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
435
  const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);