Commit 4850c24 by uvos
Parent: f328957
HIP: add GGML_CUDA_CC_IS_* for AMD families, as increasing cc architectures for AMD GPUs are not supersets of each other (llama/11601)
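The underlying issue: the AMD compute-capability constants in common.cuh are plain increasing offsets, so a threshold check meant to detect one family also matches every newer family. A minimal sketch of that misclassification, using placeholder values in place of the real defines (only the RDNA2/RDNA3 values appear in this diff):

    // Sketch only, not ggml code. Placeholder values stand in for the real
    // defines in ggml/src/ggml-cuda/common.cuh.
    #include <cassert>

    #define GGML_CUDA_CC_OFFSET_AMD 1000000
    #define GGML_CUDA_CC_CDNA  (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // CDNA, e.g. MI100
    #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RDNA1, e.g. RX 5700

    int main() {
        const int cc = GGML_CUDA_CC_RDNA1;
        // An RDNA1 card passes a ">= CDNA" check even though it is not CDNA,
        // because the RDNA constants are numerically larger.
        assert(cc >= GGML_CUDA_CC_CDNA);
        // The range check used by the new GGML_CUDA_CC_IS_CDNA macro rejects it.
        assert(!(cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1));
        return 0;
    }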
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -61,6 +61,13 @@
 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
 
+#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+
 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220
 
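To see what the new family checks do, here is a small host-side sketch that copies the macros from the hunk above verbatim; the base constants this diff does not show are placeholders, not the real values:

    // Sketch only. The GGML_CUDA_CC_IS_* macros are copied from the hunk above;
    // the base constants below are placeholders (the real ones live in common.cuh).
    #include <cstdio>

    #define GGML_CUDA_CC_OFFSET_AMD 1000000
    #define GGML_CUDA_CC_CDNA  (GGML_CUDA_CC_OFFSET_AMD + 0x908)
    #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010)
    #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030)
    #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100)

    #define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
    #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
    #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
    #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

    int main() {
        const int cc = GGML_CUDA_CC_RDNA2; // e.g. an RX 6800
        // GCN / CDNA / RDNA are disjoint half-open ranges, and RDNA is further
        // split into RDNA1/2/3, so a given card matches exactly one family.
        printf("GCN=%d CDNA=%d RDNA=%d RDNA2=%d\n",
               GGML_CUDA_CC_IS_GCN(cc), GGML_CUDA_CC_IS_CDNA(cc),
               GGML_CUDA_CC_IS_RDNA(cc), GGML_CUDA_CC_IS_RDNA2(cc));
        // -> GCN=0 CDNA=0 RDNA=1 RDNA2=1
        return 0;
    }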
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -1205,7 +1205,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
 
-        if (compute_capability
+        if (GGML_CUDA_CC_IS_CDNA(compute_capability)) {
             const float alpha = 1.0f;
             const float beta = 0.0f;
             CUBLAS_CHECK(
@@ -1750,7 +1750,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         beta = &beta_f32;
     }
 
-    if (ggml_cuda_info().devices[ctx.device].cc
+    if (GGML_CUDA_CC_IS_CDNA(ggml_cuda_info().devices[ctx.device].cc)) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
         alpha = &alpha_f32;
         beta = &beta_f32;
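In both hunks the condition that gates the FP32 cuBLAS path is now expressed with the new macro: on CDNA the GEMM runs with float alpha/beta and CUBLAS_COMPUTE_32F. A minimal sketch of that selection, with a hypothetical helper name and assuming the macro from common.cuh above is in scope:

    // Sketch only; pick_cublas_compute_type is a made-up helper, not a ggml function.
    // Assumes the GGML_CUDA_CC_IS_CDNA macro added in common.cuh above is in scope.
    #include <cublas_v2.h>

    static cublasComputeType_t pick_cublas_compute_type(const int cc) {
        // CDNA devices take the FP32 accumulation path, as in the hunks above;
        // other devices keep the FP16 compute type chosen earlier by the caller.
        return GGML_CUDA_CC_IS_CDNA(cc) ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;
    }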
ggml/src/ggml-cuda/mmq.cu
CHANGED
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return (cc
+    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
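The new return names the families explicitly: RDNA3, CDNA and GCN use MMQ only when ne11 stays below MMQ_DP4A_MAX_BATCH_SIZE, while the remaining AMD generations (RDNA1/RDNA2) use it regardless of batch size. A standalone restatement of that predicate, under the assumption that the macros and MMQ_DP4A_MAX_BATCH_SIZE are in scope:

    // Sketch only; amd_should_use_mmq is a made-up wrapper, not a ggml function.
    // Assumes the GGML_CUDA_CC_IS_* macros and MMQ_DP4A_MAX_BATCH_SIZE are in scope.
    #include <cstdint>

    static bool amd_should_use_mmq(const int cc, const int64_t ne11) {
        // RDNA1/RDNA2 (not matched by any of the three checks): always use MMQ.
        // RDNA3, CDNA, GCN: use MMQ only for batches below MMQ_DP4A_MAX_BATCH_SIZE.
        return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc))
               || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }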
ggml/src/ggml-cuda/mmq.cuh
CHANGED
@@ -120,7 +120,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
 }
 
 static constexpr int get_mmq_y_host(const int cc) {
-    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc
+    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {