Spaces:
Running
Running
Haus1
commited on
Commit
·
04b01d8
1
Parent(s):
4e38ed4
AMD: parse the architecture as supplied by gcnArchName (llama/11244)
Browse files
The value provided by minor doesn't include stepping for AMD; parse the value returned by gcnArchName instead to retrieve an accurate ID.
- ggml/src/ggml-cuda/common.cuh +10 -10
- ggml/src/ggml-cuda/ggml-cuda.cu +65 -2
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -46,20 +46,20 @@
|
|
| 46 |
#define GGML_CUDA_CC_VOLTA 700
|
| 47 |
#define GGML_CUDA_CC_TURING 750
|
| 48 |
#define GGML_CUDA_CC_AMPERE 800
|
| 49 |
-
#define GGML_CUDA_CC_OFFSET_AMD
|
| 50 |
|
| 51 |
// GCN/CDNA, wave size is 64
|
| 52 |
-
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD +
|
| 53 |
-
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD +
|
| 54 |
-
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD +
|
| 55 |
-
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD +
|
| 56 |
-
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD +
|
| 57 |
-
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD +
|
| 58 |
|
| 59 |
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
| 60 |
-
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD +
|
| 61 |
-
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD +
|
| 62 |
-
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD +
|
| 63 |
|
| 64 |
#define GGML_CUDA_CC_QY1 210
|
| 65 |
#define GGML_CUDA_CC_QY2 220
|
|
|
|
| 46 |
#define GGML_CUDA_CC_VOLTA 700
|
| 47 |
#define GGML_CUDA_CC_TURING 750
|
| 48 |
#define GGML_CUDA_CC_AMPERE 800
|
| 49 |
+
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
|
| 50 |
|
| 51 |
// GCN/CDNA, wave size is 64
|
| 52 |
+
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
| 53 |
+
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
| 54 |
+
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
| 55 |
+
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
| 56 |
+
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming
|
| 57 |
+
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
| 58 |
|
| 59 |
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
| 60 |
+
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
| 61 |
+
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
| 62 |
+
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
| 63 |
|
| 64 |
#define GGML_CUDA_CC_QY1 210
|
| 65 |
#define GGML_CUDA_CC_QY2 220
|
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -119,6 +119,55 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|
| 119 |
#endif
|
| 120 |
}
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
static ggml_cuda_device_info ggml_cuda_init() {
|
| 123 |
#ifdef __HIP_PLATFORM_AMD__
|
| 124 |
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
|
@@ -169,7 +218,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
| 169 |
|
| 170 |
cudaDeviceProp prop;
|
| 171 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
| 172 |
-
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
| 173 |
|
| 174 |
info.default_tensor_split[id] = total_vram;
|
| 175 |
total_vram += prop.totalGlobalMem;
|
|
@@ -178,10 +226,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
| 178 |
info.devices[id].smpb = prop.sharedMemPerBlock;
|
| 179 |
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 180 |
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
#else
|
| 183 |
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
| 184 |
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
|
|
|
|
|
|
| 185 |
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 186 |
}
|
| 187 |
|
|
|
|
| 119 |
#endif
|
| 120 |
}
|
| 121 |
|
| 122 |
+
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 123 |
+
static int ggml_cuda_parse_id(char devName[]) {
|
| 124 |
+
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
|
| 125 |
+
// these values are not stable so this is susceptible to breakage
|
| 126 |
+
// https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
|
| 127 |
+
int archMajor = 0x0;
|
| 128 |
+
int archMinor = 0x0;
|
| 129 |
+
int archNum = GGML_CUDA_CC_OFFSET_AMD;
|
| 130 |
+
int archLen = strlen(devName);
|
| 131 |
+
char archName[archLen + 1];
|
| 132 |
+
|
| 133 |
+
// strip leading 'gfx' while copying into our buffer
|
| 134 |
+
if (archLen > 3) {
|
| 135 |
+
strcpy(archName, &devName[3]);
|
| 136 |
+
archLen -= 3;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// trim trailing :xnack- or :sramecc- statuses
|
| 140 |
+
archLen = strcspn(archName, ":");
|
| 141 |
+
archName[archLen] = '\0';
|
| 142 |
+
|
| 143 |
+
// tease out the version information
|
| 144 |
+
if (archLen > 8) {
|
| 145 |
+
// versions labeled generic use '-' as delimiter
|
| 146 |
+
// strip the trailing "-generic" then iterate through what remains
|
| 147 |
+
if ((strstr(archName, "-generic"))) {
|
| 148 |
+
archName[archLen - 8] = '\0';
|
| 149 |
+
char * pch;
|
| 150 |
+
if ((pch = strtok(archName, "-"))) {
|
| 151 |
+
archMajor = (int)strtoul(pch, 0, 16);
|
| 152 |
+
if ((pch = strtok(NULL, "-"))) {
|
| 153 |
+
archMinor = 0x10 * (int)strtoul(pch, 0, 16);
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
} else if (archLen >= 3) {
|
| 158 |
+
// last two digits should be the minor * 0x10 + stepping
|
| 159 |
+
archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
|
| 160 |
+
archName[archLen - 2] = '\0';
|
| 161 |
+
|
| 162 |
+
// only the major version remains
|
| 163 |
+
archMajor = (int)strtoul(archName, 0, 16);
|
| 164 |
+
}
|
| 165 |
+
archNum += archMajor * 0x100;
|
| 166 |
+
archNum += archMinor;
|
| 167 |
+
return archNum;
|
| 168 |
+
}
|
| 169 |
+
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 170 |
+
|
| 171 |
static ggml_cuda_device_info ggml_cuda_init() {
|
| 172 |
#ifdef __HIP_PLATFORM_AMD__
|
| 173 |
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
|
|
|
| 218 |
|
| 219 |
cudaDeviceProp prop;
|
| 220 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
|
|
|
| 221 |
|
| 222 |
info.default_tensor_split[id] = total_vram;
|
| 223 |
total_vram += prop.totalGlobalMem;
|
|
|
|
| 226 |
info.devices[id].smpb = prop.sharedMemPerBlock;
|
| 227 |
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 228 |
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
| 229 |
+
|
| 230 |
+
info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
|
| 231 |
+
if ((info.devices[id].cc & 0xff00) == 0x0) {
|
| 232 |
+
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
|
| 233 |
+
id, prop.name, prop.gcnArchName, prop.major, prop.minor);
|
| 234 |
+
|
| 235 |
+
// Fallback to prop.major and prop.minor
|
| 236 |
+
if (prop.major > 0) {
|
| 237 |
+
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
|
| 238 |
+
info.devices[id].cc += prop.minor * 0x10;
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
|
| 242 |
+
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
|
| 243 |
#else
|
| 244 |
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
| 245 |
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
| 246 |
+
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
| 247 |
+
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
| 248 |
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 249 |
}
|
| 250 |
|