Spaces:
Running
Running
bench : multi-thread memcpy (#1534)
Browse files- whisper.cpp +92 -4
whisper.cpp
CHANGED
|
@@ -6064,6 +6064,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6064 |
// 1GB array
|
| 6065 |
const size_t size = arr*1e6;
|
| 6066 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6067 |
// single-thread
|
| 6068 |
{
|
| 6069 |
char * src = (char *) malloc(size);
|
|
@@ -6074,7 +6111,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6074 |
memcpy(dst, src, size); // heat-up
|
| 6075 |
|
| 6076 |
double tsum = 0.0;
|
| 6077 |
-
double sum = 0.0;
|
| 6078 |
|
| 6079 |
for (size_t i = 0; i < n; i++) {
|
| 6080 |
const int64_t t0 = ggml_time_us();
|
|
@@ -6088,21 +6124,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6088 |
src[rand() % size] = rand() % 256;
|
| 6089 |
}
|
| 6090 |
|
| 6091 |
-
snprintf(strbuf, sizeof(strbuf), "memcpy:
|
| 6092 |
s += strbuf;
|
| 6093 |
|
| 6094 |
// needed to prevent the compiler from optimizing the memcpy away
|
| 6095 |
{
|
| 6096 |
for (size_t i = 0; i < size; i++) sum += dst[i];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6097 |
|
| 6098 |
-
|
| 6099 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6100 |
}
|
| 6101 |
|
| 6102 |
free(src);
|
| 6103 |
free(dst);
|
| 6104 |
}
|
| 6105 |
|
|
|
|
|
|
|
|
|
|
| 6106 |
return s.c_str();
|
| 6107 |
}
|
| 6108 |
|
|
|
|
| 6064 |
// 1GB array
|
| 6065 |
const size_t size = arr*1e6;
|
| 6066 |
|
| 6067 |
+
double sum = 0.0;
|
| 6068 |
+
|
| 6069 |
+
// heat-up
|
| 6070 |
+
{
|
| 6071 |
+
char * src = (char *) malloc(size);
|
| 6072 |
+
char * dst = (char *) malloc(size);
|
| 6073 |
+
|
| 6074 |
+
for (size_t i = 0; i < size; i++) src[i] = i;
|
| 6075 |
+
|
| 6076 |
+
memcpy(dst, src, size); // heat-up
|
| 6077 |
+
|
| 6078 |
+
double tsum = 0.0;
|
| 6079 |
+
|
| 6080 |
+
for (size_t i = 0; i < n; i++) {
|
| 6081 |
+
const int64_t t0 = ggml_time_us();
|
| 6082 |
+
|
| 6083 |
+
memcpy(dst, src, size);
|
| 6084 |
+
|
| 6085 |
+
const int64_t t1 = ggml_time_us();
|
| 6086 |
+
|
| 6087 |
+
tsum += (t1 - t0)*1e-6;
|
| 6088 |
+
|
| 6089 |
+
src[rand() % size] = rand() % 256;
|
| 6090 |
+
}
|
| 6091 |
+
|
| 6092 |
+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
|
| 6093 |
+
s += strbuf;
|
| 6094 |
+
|
| 6095 |
+
// needed to prevent the compiler from optimizing the memcpy away
|
| 6096 |
+
{
|
| 6097 |
+
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 6098 |
+
}
|
| 6099 |
+
|
| 6100 |
+
free(src);
|
| 6101 |
+
free(dst);
|
| 6102 |
+
}
|
| 6103 |
+
|
| 6104 |
// single-thread
|
| 6105 |
{
|
| 6106 |
char * src = (char *) malloc(size);
|
|
|
|
| 6111 |
memcpy(dst, src, size); // heat-up
|
| 6112 |
|
| 6113 |
double tsum = 0.0;
|
|
|
|
| 6114 |
|
| 6115 |
for (size_t i = 0; i < n; i++) {
|
| 6116 |
const int64_t t0 = ggml_time_us();
|
|
|
|
| 6124 |
src[rand() % size] = rand() % 256;
|
| 6125 |
}
|
| 6126 |
|
| 6127 |
+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
|
| 6128 |
s += strbuf;
|
| 6129 |
|
| 6130 |
// needed to prevent the compiler from optimizing the memcpy away
|
| 6131 |
{
|
| 6132 |
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 6133 |
+
}
|
| 6134 |
+
|
| 6135 |
+
free(src);
|
| 6136 |
+
free(dst);
|
| 6137 |
+
}
|
| 6138 |
+
|
| 6139 |
+
// multi-thread
|
| 6140 |
+
|
| 6141 |
+
for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
|
| 6142 |
+
char * src = (char *) malloc(size);
|
| 6143 |
+
char * dst = (char *) malloc(size);
|
| 6144 |
+
|
| 6145 |
+
for (size_t i = 0; i < size; i++) src[i] = i;
|
| 6146 |
+
|
| 6147 |
+
memcpy(dst, src, size); // heat-up
|
| 6148 |
+
|
| 6149 |
+
double tsum = 0.0;
|
| 6150 |
+
|
| 6151 |
+
auto helper = [&](int th) {
|
| 6152 |
+
const int64_t i0 = (th + 0)*size/n_threads;
|
| 6153 |
+
const int64_t i1 = (th + 1)*size/n_threads;
|
| 6154 |
+
|
| 6155 |
+
for (size_t i = 0; i < n; i++) {
|
| 6156 |
+
memcpy(dst + i0, src + i0, i1 - i0);
|
| 6157 |
|
| 6158 |
+
src[i0 + rand() % (i1 - i0)] = rand() % 256;
|
| 6159 |
+
};
|
| 6160 |
+
};
|
| 6161 |
+
|
| 6162 |
+
const int64_t t0 = ggml_time_us();
|
| 6163 |
+
|
| 6164 |
+
std::vector<std::thread> threads(n_threads - 1);
|
| 6165 |
+
for (uint32_t th = 0; th < n_threads - 1; ++th) {
|
| 6166 |
+
threads[th] = std::thread(helper, th);
|
| 6167 |
+
}
|
| 6168 |
+
|
| 6169 |
+
helper(n_threads - 1);
|
| 6170 |
+
|
| 6171 |
+
for (uint32_t th = 0; th < n_threads - 1; ++th) {
|
| 6172 |
+
threads[th].join();
|
| 6173 |
+
}
|
| 6174 |
+
|
| 6175 |
+
const int64_t t1 = ggml_time_us();
|
| 6176 |
+
|
| 6177 |
+
tsum += (t1 - t0)*1e-6;
|
| 6178 |
+
|
| 6179 |
+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
|
| 6180 |
+
s += strbuf;
|
| 6181 |
+
|
| 6182 |
+
// needed to prevent the compiler from optimizing the memcpy away
|
| 6183 |
+
{
|
| 6184 |
+
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 6185 |
}
|
| 6186 |
|
| 6187 |
free(src);
|
| 6188 |
free(dst);
|
| 6189 |
}
|
| 6190 |
|
| 6191 |
+
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
|
| 6192 |
+
s += strbuf;
|
| 6193 |
+
|
| 6194 |
return s.c_str();
|
| 6195 |
}
|
| 6196 |
|