Haus1 committed on
Commit
04b01d8
·
1 Parent(s): 4e38ed4

AMD: parse the architecture as supplied by gcnArchName (llama/11244)

Browse files

The value provided by `minor` doesn't include the stepping on AMD devices; parse the value returned by `gcnArchName` instead to retrieve an accurate ID.

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -46,20 +46,20 @@
46
  #define GGML_CUDA_CC_VOLTA 700
47
  #define GGML_CUDA_CC_TURING 750
48
  #define GGML_CUDA_CC_AMPERE 800
49
- #define GGML_CUDA_CC_OFFSET_AMD 1000000
50
 
51
  // GCN/CDNA, wave size is 64
52
- #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
53
- #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
54
- #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
55
- #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
56
- #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
57
- #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
58
 
59
  // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
60
- #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
61
- #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
62
- #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
63
 
64
  #define GGML_CUDA_CC_QY1 210
65
  #define GGML_CUDA_CC_QY2 220
 
46
  #define GGML_CUDA_CC_VOLTA 700
47
  #define GGML_CUDA_CC_TURING 750
48
  #define GGML_CUDA_CC_AMPERE 800
49
+ #define GGML_CUDA_CC_OFFSET_AMD 0x1000000
50
 
51
  // GCN/CDNA, wave size is 64
52
+ #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
53
+ #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
54
+ #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
55
+ #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
56
+ #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
57
+ #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
58
 
59
  // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
60
+ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
61
+ #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
62
+ #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
63
 
64
  #define GGML_CUDA_CC_QY1 210
65
  #define GGML_CUDA_CC_QY2 220
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -119,6 +119,55 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
119
  #endif
120
  }
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  static ggml_cuda_device_info ggml_cuda_init() {
123
  #ifdef __HIP_PLATFORM_AMD__
124
  // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -169,7 +218,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
169
 
170
  cudaDeviceProp prop;
171
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
172
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
173
 
174
  info.default_tensor_split[id] = total_vram;
175
  total_vram += prop.totalGlobalMem;
@@ -178,10 +226,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
178
  info.devices[id].smpb = prop.sharedMemPerBlock;
179
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
180
  info.devices[id].smpbo = prop.sharedMemPerBlock;
181
- info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  #else
183
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
184
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
 
 
185
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
186
  }
187
 
 
119
  #endif
120
  }
121
 
122
+ #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
123
+ static int ggml_cuda_parse_id(char devName[]) {
124
+ // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
125
+ // these values are not stable so this is susceptible to breakage
126
+ // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
127
+ int archMajor = 0x0;
128
+ int archMinor = 0x0;
129
+ int archNum = GGML_CUDA_CC_OFFSET_AMD;
130
+ int archLen = strlen(devName);
131
+ char archName[archLen + 1];
132
+
133
+ // strip leading 'gfx' while copying into our buffer
134
+ if (archLen > 3) {
135
+ strcpy(archName, &devName[3]);
136
+ archLen -= 3;
137
+ }
138
+
139
+ // trim trailing :xnack- or :sramecc- statuses
140
+ archLen = strcspn(archName, ":");
141
+ archName[archLen] = '\0';
142
+
143
+ // tease out the version information
144
+ if (archLen > 8) {
145
+ // versions labeled generic use '-' as delimiter
146
+ // strip the trailing "-generic" then iterate through what remains
147
+ if ((strstr(archName, "-generic"))) {
148
+ archName[archLen - 8] = '\0';
149
+ char * pch;
150
+ if ((pch = strtok(archName, "-"))) {
151
+ archMajor = (int)strtoul(pch, 0, 16);
152
+ if ((pch = strtok(NULL, "-"))) {
153
+ archMinor = 0x10 * (int)strtoul(pch, 0, 16);
154
+ }
155
+ }
156
+ }
157
+ } else if (archLen >= 3) {
158
+ // last two digits should be the minor * 0x10 + stepping
159
+ archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
160
+ archName[archLen - 2] = '\0';
161
+
162
+ // only the major version remains
163
+ archMajor = (int)strtoul(archName, 0, 16);
164
+ }
165
+ archNum += archMajor * 0x100;
166
+ archNum += archMinor;
167
+ return archNum;
168
+ }
169
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
170
+
171
  static ggml_cuda_device_info ggml_cuda_init() {
172
  #ifdef __HIP_PLATFORM_AMD__
173
  // Workaround for a rocBLAS bug when using multiple graphics cards:
 
218
 
219
  cudaDeviceProp prop;
220
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
 
221
 
222
  info.default_tensor_split[id] = total_vram;
223
  total_vram += prop.totalGlobalMem;
 
226
  info.devices[id].smpb = prop.sharedMemPerBlock;
227
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
228
  info.devices[id].smpbo = prop.sharedMemPerBlock;
229
+
230
+ info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
231
+ if ((info.devices[id].cc & 0xff00) == 0x0) {
232
+ GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
233
+ id, prop.name, prop.gcnArchName, prop.major, prop.minor);
234
+
235
+ // Fallback to prop.major and prop.minor
236
+ if (prop.major > 0) {
237
+ info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
238
+ info.devices[id].cc += prop.minor * 0x10;
239
+ }
240
+ }
241
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
242
+ id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
243
  #else
244
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
245
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
246
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
247
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
248
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
249
  }
250