Spaces:
Running
Running
Commit
·
acfd94f
1
Parent(s):
ed08269
HIP: fix flash_attn_stream_k_fixup warning (llama/11604)
Browse files
ggml/src/ggml-cuda/fattn-common.cuh
CHANGED
|
@@ -516,6 +516,12 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
|
| 516 |
nullptr;
|
| 517 |
}
|
| 518 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
template<int D, int ncols, int KQ_stride> // D == head size
|
| 520 |
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
| 521 |
__launch_bounds__(D, 1)
|
|
@@ -614,6 +620,10 @@ static __global__ void flash_attn_stream_k_fixup(
|
|
| 614 |
}
|
| 615 |
}
|
| 616 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
template<int D, int parallel_blocks> // D == head size
|
| 618 |
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
| 619 |
__launch_bounds__(D, 1)
|
|
|
|
| 516 |
nullptr;
|
| 517 |
}
|
| 518 |
|
| 519 |
+
// The HIP compiler emits a -Wpass-failed warning because it cannot unroll a loop containing the jt*ncols + j >= ne01 conditional; the warning is suppressed below when compiling with clang.
|
| 520 |
+
#ifdef __clang__
|
| 521 |
+
#pragma clang diagnostic push
|
| 522 |
+
#pragma clang diagnostic ignored "-Wpass-failed"
|
| 523 |
+
#endif // __clang__
|
| 524 |
+
|
| 525 |
template<int D, int ncols, int KQ_stride> // D == head size
|
| 526 |
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
| 527 |
__launch_bounds__(D, 1)
|
|
|
|
| 620 |
}
|
| 621 |
}
|
| 622 |
|
| 623 |
+
#ifdef __clang__
|
| 624 |
+
#pragma clang diagnostic pop
|
| 625 |
+
#endif // __clang__
|
| 626 |
+
|
| 627 |
template<int D, int parallel_blocks> // D == head size
|
| 628 |
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
| 629 |
__launch_bounds__(D, 1)
|
ggml/src/ggml-cuda/softmax.cu
CHANGED
|
@@ -18,7 +18,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
|
|
| 18 |
#ifdef __clang__
|
| 19 |
#pragma clang diagnostic push
|
| 20 |
#pragma clang diagnostic ignored "-Wpass-failed"
|
| 21 |
-
#endif
|
| 22 |
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
| 23 |
static __global__ void soft_max_f32(
|
| 24 |
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
|
@@ -126,7 +126,7 @@ static __global__ void soft_max_f32(
|
|
| 126 |
}
|
| 127 |
#ifdef __clang__
|
| 128 |
#pragma clang diagnostic pop
|
| 129 |
-
#endif
|
| 130 |
|
| 131 |
static __global__ void soft_max_back_f32(
|
| 132 |
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
|
|
|
|
| 18 |
#ifdef __clang__
|
| 19 |
#pragma clang diagnostic push
|
| 20 |
#pragma clang diagnostic ignored "-Wpass-failed"
|
| 21 |
+
#endif // __clang__
|
| 22 |
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
| 23 |
static __global__ void soft_max_f32(
|
| 24 |
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
|
|
|
| 126 |
}
|
| 127 |
#ifdef __clang__
|
| 128 |
#pragma clang diagnostic pop
|
| 129 |
+
#endif // __clang__
|
| 130 |
|
| 131 |
static __global__ void soft_max_back_f32(
|
| 132 |
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
|