Spaces:
Running
Running
parallel : print time of audio boundaries + fix timings
Browse files- whisper.cpp +38 -12
- whisper.h +3 -0
whisper.cpp
CHANGED
|
@@ -1910,14 +1910,19 @@ whisper_vocab::id whisper_sample_timestamp(
|
|
| 1910 |
return probs_id[0].second;
|
| 1911 |
}
|
| 1912 |
|
| 1913 |
-
|
| 1914 |
-
|
| 1915 |
-
|
| 1916 |
-
int64_t
|
| 1917 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1918 |
|
| 1919 |
char buf[32];
|
| 1920 |
-
snprintf(buf, sizeof(buf), "%02d:%02d
|
| 1921 |
|
| 1922 |
return std::string(buf);
|
| 1923 |
}
|
|
@@ -2727,24 +2732,45 @@ int whisper_full_parallel(
|
|
| 2727 |
|
| 2728 |
// combine results into ctx->result_all
|
| 2729 |
for (int i = 0; i < n_processors - 1; ++i) {
|
| 2730 |
-
auto &
|
| 2731 |
|
| 2732 |
-
for (int j = 0; j < (int)
|
| 2733 |
-
|
| 2734 |
-
|
|
|
|
| 2735 |
|
|
|
|
| 2736 |
if (ctx->result_all.size() > 0) {
|
| 2737 |
-
|
| 2738 |
}
|
| 2739 |
|
| 2740 |
-
ctx->result_all.push_back(std::move(
|
| 2741 |
|
| 2742 |
// call the new_segment_callback for each segment
|
| 2743 |
if (params.new_segment_callback) {
|
| 2744 |
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
|
| 2745 |
}
|
| 2746 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2747 |
}
|
|
|
|
| 2748 |
|
| 2749 |
return ret;
|
| 2750 |
}
|
|
|
|
| 1910 |
return probs_id[0].second;
|
| 1911 |
}
|
| 1912 |
|
| 1913 |
+
// 500 -> 00:00:05.000
|
| 1914 |
+
// 6000 -> 00:01:00.000
|
| 1915 |
+
std::string to_timestamp(int64_t t, bool comma = false) {
|
| 1916 |
+
int64_t msec = t * 10;
|
| 1917 |
+
int64_t hr = msec / (1000 * 60 * 60);
|
| 1918 |
+
msec = msec - hr * (1000 * 60 * 60);
|
| 1919 |
+
int64_t min = msec / (1000 * 60);
|
| 1920 |
+
msec = msec - min * (1000 * 60);
|
| 1921 |
+
int64_t sec = msec / 1000;
|
| 1922 |
+
msec = msec - sec * 1000;
|
| 1923 |
|
| 1924 |
char buf[32];
|
| 1925 |
+
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
|
| 1926 |
|
| 1927 |
return std::string(buf);
|
| 1928 |
}
|
|
|
|
| 2732 |
|
| 2733 |
// combine results into ctx->result_all
|
| 2734 |
for (int i = 0; i < n_processors - 1; ++i) {
|
| 2735 |
+
auto & results_i = ctxs[i].result_all;
|
| 2736 |
|
| 2737 |
+
for (int j = 0; j < (int) results_i.size(); ++j) {
|
| 2738 |
+
// correct the segment timestamp taking into account the offset
|
| 2739 |
+
results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
|
| 2740 |
+
results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
|
| 2741 |
|
| 2742 |
+
// make sure that segments are not overlapping
|
| 2743 |
if (ctx->result_all.size() > 0) {
|
| 2744 |
+
results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
|
| 2745 |
}
|
| 2746 |
|
| 2747 |
+
ctx->result_all.push_back(std::move(results_i[j]));
|
| 2748 |
|
| 2749 |
// call the new_segment_callback for each segment
|
| 2750 |
if (params.new_segment_callback) {
|
| 2751 |
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
|
| 2752 |
}
|
| 2753 |
}
|
| 2754 |
+
|
| 2755 |
+
ctx->t_mel_us += ctxs[i].t_mel_us;
|
| 2756 |
+
ctx->t_sample_us += ctxs[i].t_sample_us;
|
| 2757 |
+
ctx->t_encode_us += ctxs[i].t_encode_us;
|
| 2758 |
+
ctx->t_decode_us += ctxs[i].t_decode_us;
|
| 2759 |
+
}
|
| 2760 |
+
|
| 2761 |
+
// average the timings
|
| 2762 |
+
ctx->t_mel_us /= n_processors;
|
| 2763 |
+
ctx->t_sample_us /= n_processors;
|
| 2764 |
+
ctx->t_encode_us /= n_processors;
|
| 2765 |
+
ctx->t_decode_us /= n_processors;
|
| 2766 |
+
|
| 2767 |
+
// print information about the audio boundaries
|
| 2768 |
+
fprintf(stderr, "\n");
|
| 2769 |
+
fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
|
| 2770 |
+
for (int i = 0; i < n_processors - 1; ++i) {
|
| 2771 |
+
fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
|
| 2772 |
}
|
| 2773 |
+
fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
|
| 2774 |
|
| 2775 |
return ret;
|
| 2776 |
}
|
whisper.h
CHANGED
|
@@ -213,6 +213,9 @@ extern "C" {
|
|
| 213 |
const float * samples,
|
| 214 |
int n_samples);
|
| 215 |
|
|
|
|
|
|
|
|
|
|
| 216 |
WHISPER_API int whisper_full_parallel(
|
| 217 |
struct whisper_context * ctx,
|
| 218 |
struct whisper_full_params params,
|
|
|
|
| 213 |
const float * samples,
|
| 214 |
int n_samples);
|
| 215 |
|
| 216 |
+
// Split the input audio in chunks and process each chunk separately using whisper_full()
|
| 217 |
+
// It seems this approach can offer some speedup in some cases.
|
| 218 |
+
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
| 219 |
WHISPER_API int whisper_full_parallel(
|
| 220 |
struct whisper_context * ctx,
|
| 221 |
struct whisper_full_params params,
|