ggerganov committed on
Commit
c0ee6b3
·
unverified ·
1 Parent(s): 3217f72

bench : multi-thread memcpy (#1534)

Browse files
Files changed (1) hide show
  1. whisper.cpp +92 -4
whisper.cpp CHANGED
@@ -6064,6 +6064,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6064
  // 1GB array
6065
  const size_t size = arr*1e6;
6066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6067
  // single-thread
6068
  {
6069
  char * src = (char *) malloc(size);
@@ -6074,7 +6111,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6074
  memcpy(dst, src, size); // heat-up
6075
 
6076
  double tsum = 0.0;
6077
- double sum = 0.0;
6078
 
6079
  for (size_t i = 0; i < n; i++) {
6080
  const int64_t t0 = ggml_time_us();
@@ -6088,21 +6124,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6088
  src[rand() % size] = rand() % 256;
6089
  }
6090
 
6091
- snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
6092
  s += strbuf;
6093
 
6094
  // needed to prevent the compiler from optimizing the memcpy away
6095
  {
6096
  for (size_t i = 0; i < size; i++) sum += dst[i];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6097
 
6098
- snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
6099
- s += strbuf;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6100
  }
6101
 
6102
  free(src);
6103
  free(dst);
6104
  }
6105
 
 
 
 
6106
  return s.c_str();
6107
  }
6108
 
 
6064
  // 1GB array
6065
  const size_t size = arr*1e6;
6066
 
6067
+ double sum = 0.0;
6068
+
6069
+ // heat-up
6070
+ {
6071
+ char * src = (char *) malloc(size);
6072
+ char * dst = (char *) malloc(size);
6073
+
6074
+ for (size_t i = 0; i < size; i++) src[i] = i;
6075
+
6076
+ memcpy(dst, src, size); // heat-up
6077
+
6078
+ double tsum = 0.0;
6079
+
6080
+ for (size_t i = 0; i < n; i++) {
6081
+ const int64_t t0 = ggml_time_us();
6082
+
6083
+ memcpy(dst, src, size);
6084
+
6085
+ const int64_t t1 = ggml_time_us();
6086
+
6087
+ tsum += (t1 - t0)*1e-6;
6088
+
6089
+ src[rand() % size] = rand() % 256;
6090
+ }
6091
+
6092
+ snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
6093
+ s += strbuf;
6094
+
6095
+ // needed to prevent the compiler from optimizing the memcpy away
6096
+ {
6097
+ for (size_t i = 0; i < size; i++) sum += dst[i];
6098
+ }
6099
+
6100
+ free(src);
6101
+ free(dst);
6102
+ }
6103
+
6104
  // single-thread
6105
  {
6106
  char * src = (char *) malloc(size);
 
6111
  memcpy(dst, src, size); // heat-up
6112
 
6113
  double tsum = 0.0;
 
6114
 
6115
  for (size_t i = 0; i < n; i++) {
6116
  const int64_t t0 = ggml_time_us();
 
6124
  src[rand() % size] = rand() % 256;
6125
  }
6126
 
6127
+ snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
6128
  s += strbuf;
6129
 
6130
  // needed to prevent the compiler from optimizing the memcpy away
6131
  {
6132
  for (size_t i = 0; i < size; i++) sum += dst[i];
6133
+ }
6134
+
6135
+ free(src);
6136
+ free(dst);
6137
+ }
6138
+
6139
+ // multi-thread
6140
+
6141
+ for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
6142
+ char * src = (char *) malloc(size);
6143
+ char * dst = (char *) malloc(size);
6144
+
6145
+ for (size_t i = 0; i < size; i++) src[i] = i;
6146
+
6147
+ memcpy(dst, src, size); // heat-up
6148
+
6149
+ double tsum = 0.0;
6150
+
6151
+ auto helper = [&](int th) {
6152
+ const int64_t i0 = (th + 0)*size/n_threads;
6153
+ const int64_t i1 = (th + 1)*size/n_threads;
6154
+
6155
+ for (size_t i = 0; i < n; i++) {
6156
+ memcpy(dst + i0, src + i0, i1 - i0);
6157
 
6158
+ src[i0 + rand() % (i1 - i0)] = rand() % 256;
6159
+ };
6160
+ };
6161
+
6162
+ const int64_t t0 = ggml_time_us();
6163
+
6164
+ std::vector<std::thread> threads(n_threads - 1);
6165
+ for (uint32_t th = 0; th < n_threads - 1; ++th) {
6166
+ threads[th] = std::thread(helper, th);
6167
+ }
6168
+
6169
+ helper(n_threads - 1);
6170
+
6171
+ for (uint32_t th = 0; th < n_threads - 1; ++th) {
6172
+ threads[th].join();
6173
+ }
6174
+
6175
+ const int64_t t1 = ggml_time_us();
6176
+
6177
+ tsum += (t1 - t0)*1e-6;
6178
+
6179
+ snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
6180
+ s += strbuf;
6181
+
6182
+ // needed to prevent the compiler from optimizing the memcpy away
6183
+ {
6184
+ for (size_t i = 0; i < size; i++) sum += dst[i];
6185
  }
6186
 
6187
  free(src);
6188
  free(dst);
6189
  }
6190
 
6191
+ snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
6192
+ s += strbuf;
6193
+
6194
  return s.c_str();
6195
  }
6196