ggerganov committed on
Commit
11bb554
·
unverified ·
1 Parent(s): 8d1f7e9

whisper.objc : add real-time processing (#97)

Browse files
examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard CHANGED
@@ -1,8 +1,8 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
- <document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
3
  <device id="retina6_0" orientation="portrait" appearance="light"/>
4
  <dependencies>
5
- <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
6
  <capability name="Safe area layout guides" minToolsVersion="9.0"/>
7
  <capability name="System colors in document resources" minToolsVersion="11.0"/>
8
  <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@@ -40,7 +40,7 @@
40
  <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
41
  <color key="backgroundColor" systemColor="systemBackgroundColor"/>
42
  <color key="textColor" systemColor="labelColor"/>
43
- <fontDescription key="fontDescription" type="system" pointSize="20"/>
44
  <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
45
  </textView>
46
  <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@@ -56,6 +56,18 @@
56
  <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
57
  </connections>
58
  </button>
 
 
 
 
 
 
 
 
 
 
 
 
59
  </subviews>
60
  <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
61
  <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@@ -64,6 +76,7 @@
64
  </constraints>
65
  </view>
66
  <connections>
 
67
  <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
68
  <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
69
  <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
 
1
  <?xml version="1.0" encoding="UTF-8"?>
2
+ <document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
3
  <device id="retina6_0" orientation="portrait" appearance="light"/>
4
  <dependencies>
5
+ <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
6
  <capability name="Safe area layout guides" minToolsVersion="9.0"/>
7
  <capability name="System colors in document resources" minToolsVersion="11.0"/>
8
  <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
 
40
  <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
41
  <color key="backgroundColor" systemColor="systemBackgroundColor"/>
42
  <color key="textColor" systemColor="labelColor"/>
43
+ <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
44
  <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
45
  </textView>
46
  <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
 
56
  <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
57
  </connections>
58
  </button>
59
+ <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
60
+ <rect key="frame" x="199" y="191" width="156" height="49"/>
61
+ <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
62
+ <color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
63
+ <color key="tintColor" systemColor="opaqueSeparatorColor"/>
64
+ <state key="normal" title="Real-time">
65
+ <color key="titleColor" systemColor="labelColor"/>
66
+ </state>
67
+ <connections>
68
+ <action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
69
+ </connections>
70
+ </button>
71
  </subviews>
72
  <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
73
  <color key="backgroundColor" systemColor="systemBackgroundColor"/>
 
76
  </constraints>
77
  </view>
78
  <connections>
79
+ <outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
80
  <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
81
  <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
82
  <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
examples/whisper.objc/whisper.objc/ViewController.h CHANGED
@@ -20,6 +20,8 @@ typedef struct
20
  {
21
  int ggwaveId;
22
  bool isCapturing;
 
 
23
  UILabel * labelReceived;
24
 
25
  AudioQueueRef queue;
@@ -31,6 +33,8 @@ typedef struct
31
  float * audioBufferF32;
32
 
33
  struct whisper_context * ctx;
 
 
34
  } StateInp;
35
 
36
  @interface ViewController : UIViewController
 
20
  {
21
  int ggwaveId;
22
  bool isCapturing;
23
+ bool isTranscribing;
24
+ bool isRealtime;
25
  UILabel * labelReceived;
26
 
27
  AudioQueueRef queue;
 
33
  float * audioBufferF32;
34
 
35
  struct whisper_context * ctx;
36
+
37
+ void * vc;
38
  } StateInp;
39
 
40
  @interface ViewController : UIViewController
examples/whisper.objc/whisper.objc/ViewController.m CHANGED
@@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
24
  @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
25
  @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
26
  @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
 
27
  @property (weak, nonatomic) IBOutlet UITextView *textviewResult;
28
 
29
  @end
@@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
77
  stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
78
  stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
79
  }
 
 
 
80
  }
81
 
82
  -(IBAction) stopCapturing {
@@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
109
  NSLog(@"Start capturing");
110
 
111
  stateInp.n_samples = 0;
 
112
 
113
  OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
114
  AudioInputCallback,
@@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
141
  - (IBAction)onTranscribePrepare:(id)sender {
142
  _textviewResult.text = @"Processing - please wait ...";
143
 
 
 
 
 
144
  if (stateInp.isCapturing) {
145
- // stop capturing
146
  [self stopCapturing];
 
 
147
 
148
- return;
 
 
 
 
 
 
149
  }
 
 
150
  }
151
 
152
  - (IBAction)onTranscribe:(id)sender {
 
 
 
 
153
  NSLog(@"Processing %d samples", stateInp.n_samples);
154
 
155
- // process captured audio
156
- // convert I16 to F32
157
- for (int i = 0; i < stateInp.n_samples; i++) {
158
- stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
159
- }
 
 
 
 
160
 
161
- // run the model
162
- struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
163
 
164
- params.print_realtime = true;
165
- params.print_progress = false;
166
- params.print_timestamps = true;
167
- params.print_special = false;
168
- params.translate = false;
169
- params.language = "en";
170
- params.n_threads = 4;
171
- params.offset_ms = 0;
172
 
173
- CFTimeInterval startTime = CACurrentMediaTime();
 
 
 
 
 
 
 
 
174
 
175
- if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
176
- NSLog(@"Failed to run the model");
177
- _textviewResult.text = @"Failed to run the model";
178
 
179
- return;
180
- }
181
 
182
- CFTimeInterval endTime = CACurrentMediaTime();
 
 
183
 
184
- // clear the text in the textview
185
- _textviewResult.text = @"";
186
 
187
- int n_segments = whisper_full_n_segments(stateInp.ctx);
188
- for (int i = 0; i < n_segments; i++) {
189
- const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
190
 
191
- // append the text to the textview
192
- _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
193
- }
194
 
195
- // internal model timing
196
- whisper_print_timings(stateInp.ctx);
197
 
198
- NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
 
 
199
 
200
- _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
 
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
 
203
  //
204
- // Callback implmentation
205
  //
206
 
207
  void AudioInputCallback(void * inUserData,
@@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,
224
 
225
  if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
226
  NSLog(@"Too much audio data, ignoring");
 
 
 
 
 
 
227
  return;
228
  }
229
 
@@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,
235
 
236
  // put the buffer back in the queue
237
  AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
 
 
 
 
 
 
 
 
238
  }
239
 
240
  @end
 
24
  @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
25
  @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
26
  @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
27
+ @property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
28
  @property (weak, nonatomic) IBOutlet UITextView *textviewResult;
29
 
30
  @end
 
78
  stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
79
  stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
80
  }
81
+
82
+ stateInp.isTranscribing = false;
83
+ stateInp.isRealtime = false;
84
  }
85
 
86
  -(IBAction) stopCapturing {
 
113
  NSLog(@"Start capturing");
114
 
115
  stateInp.n_samples = 0;
116
+ stateInp.vc = (__bridge void *)(self);
117
 
118
  OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
119
  AudioInputCallback,
 
146
  - (IBAction)onTranscribePrepare:(id)sender {
147
  _textviewResult.text = @"Processing - please wait ...";
148
 
149
+ if (stateInp.isRealtime) {
150
+ [self onRealtime:(id)sender];
151
+ }
152
+
153
  if (stateInp.isCapturing) {
 
154
  [self stopCapturing];
155
+ }
156
+ }
157
 
158
+ - (IBAction)onRealtime:(id)sender {
159
+ stateInp.isRealtime = !stateInp.isRealtime;
160
+
161
+ if (stateInp.isRealtime) {
162
+ [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
163
+ } else {
164
+ [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
165
  }
166
+
167
+ NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
168
  }
169
 
170
  - (IBAction)onTranscribe:(id)sender {
171
+ if (stateInp.isTranscribing) {
172
+ return;
173
+ }
174
+
175
  NSLog(@"Processing %d samples", stateInp.n_samples);
176
 
177
+ stateInp.isTranscribing = true;
178
+
179
+ // dispatch the model to a background thread
180
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
181
+ // process captured audio
182
+ // convert I16 to F32
183
+ for (int i = 0; i < self->stateInp.n_samples; i++) {
184
+ self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
185
+ }
186
 
187
+ // run the model
188
+ struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
189
 
190
+ // get maximum number of threads on this device (max 8)
191
+ const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
 
 
 
 
 
 
192
 
193
+ params.print_realtime = true;
194
+ params.print_progress = false;
195
+ params.print_timestamps = true;
196
+ params.print_special = false;
197
+ params.translate = false;
198
+ params.language = "en";
199
+ params.n_threads = max_threads;
200
+ params.offset_ms = 0;
201
+ params.single_segment = self->stateInp.isRealtime;
202
 
203
+ CFTimeInterval startTime = CACurrentMediaTime();
 
 
204
 
205
+ whisper_reset_timings(self->stateInp.ctx);
 
206
 
207
+ if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
208
+ NSLog(@"Failed to run the model");
209
+ self->_textviewResult.text = @"Failed to run the model";
210
 
211
+ return;
212
+ }
213
 
214
+ whisper_print_timings(self->stateInp.ctx);
 
 
215
 
216
+ CFTimeInterval endTime = CACurrentMediaTime();
217
+
218
+ NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
219
 
220
+ // result text
221
+ NSString *result = @"";
222
 
223
+ int n_segments = whisper_full_n_segments(self->stateInp.ctx);
224
+ for (int i = 0; i < n_segments; i++) {
225
+ const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
226
 
227
+ // append the text to the result
228
+ result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
229
+ }
230
+
231
+ // append processing time
232
+ result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
233
+
234
+ // dispatch the result to the main thread
235
+ dispatch_async(dispatch_get_main_queue(), ^{
236
+ self->_textviewResult.text = result;
237
+ self->stateInp.isTranscribing = false;
238
+ });
239
+ });
240
  }
241
 
242
  //
243
+ // Callback implementation
244
  //
245
 
246
  void AudioInputCallback(void * inUserData,
 
263
 
264
  if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
265
  NSLog(@"Too much audio data, ignoring");
266
+
267
+ dispatch_async(dispatch_get_main_queue(), ^{
268
+ ViewController * vc = (__bridge ViewController *)(stateInp->vc);
269
+ [vc stopCapturing];
270
+ });
271
+
272
  return;
273
  }
274
 
 
280
 
281
  // put the buffer back in the queue
282
  AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
283
+
284
+ if (stateInp->isRealtime) {
285
+ // dispatch onTranscribe() to the main thread
286
+ dispatch_async(dispatch_get_main_queue(), ^{
287
+ ViewController * vc = (__bridge ViewController *)(stateInp->vc);
288
+ [vc onTranscribe:nil];
289
+ });
290
+ }
291
  }
292
 
293
  @end
whisper.cpp CHANGED
@@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
2386
  ctx->t_decode_us = 0;
2387
  }
2388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2389
  ////////////////////////////////////////////////////////////////////////////
2390
 
2391
  struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@@ -2863,7 +2878,7 @@ int whisper_full_parallel(
2863
  struct whisper_full_params params,
2864
  const float * samples,
2865
  int n_samples,
2866
- const int n_processors) {
2867
  if (n_processors == 1) {
2868
  return whisper_full(ctx, params, samples, n_samples);
2869
  }
@@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
3040
  return ctx->result_all[i_segment].tokens[i_token].p;
3041
  }
3042
 
3043
- const char * whisper_print_system_info(void) {
3044
- static std::string s;
3045
-
3046
- s = "";
3047
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
3048
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
3049
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
3050
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
3051
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
3052
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
3053
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
3054
-
3055
- return s.c_str();
3056
- }
3057
-
3058
  // =================================================================================================
3059
 
3060
  //
 
2386
  ctx->t_decode_us = 0;
2387
  }
2388
 
2389
+ const char * whisper_print_system_info(void) {
2390
+ static std::string s;
2391
+
2392
+ s = "";
2393
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
2394
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
2395
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
2396
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
2397
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
2398
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
2399
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
2400
+
2401
+ return s.c_str();
2402
+ }
2403
+
2404
  ////////////////////////////////////////////////////////////////////////////
2405
 
2406
  struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
 
2878
  struct whisper_full_params params,
2879
  const float * samples,
2880
  int n_samples,
2881
+ int n_processors) {
2882
  if (n_processors == 1) {
2883
  return whisper_full(ctx, params, samples, n_samples);
2884
  }
 
3055
  return ctx->result_all[i_segment].tokens[i_token].p;
3056
  }
3057
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3058
  // =================================================================================================
3059
 
3060
  //
whisper.h CHANGED
@@ -72,16 +72,16 @@ extern "C" {
72
  whisper_token id; // token id
73
  whisper_token tid; // forced timestamp token id
74
 
75
- float p; // probability of the token
76
- float pt; // probability of the timestamp token
77
- float ptsum; // sum of probabilities of all timestamp tokens
78
 
79
  // token-level timestamp data
80
  // do not use if you haven't computed token-level timestamps
81
- int64_t t0; // start time of the token
82
- int64_t t1; // end time of the token
83
 
84
- float vlen; // voice length of the token
85
  } whisper_token_data;
86
 
87
  // Allocates all memory needed for the model and loads the model from the given file.
@@ -96,9 +96,9 @@ extern "C" {
96
  // Returns 0 on success
97
  WHISPER_API int whisper_pcm_to_mel(
98
  struct whisper_context * ctx,
99
- const float * samples,
100
- int n_samples,
101
- int n_threads);
102
 
103
  // This can be used to set a custom log mel spectrogram inside the provided whisper context.
104
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@@ -106,9 +106,9 @@ extern "C" {
106
  // Returns 0 on success
107
  WHISPER_API int whisper_set_mel(
108
  struct whisper_context * ctx,
109
- const float * data,
110
- int n_len,
111
- int n_mel);
112
 
113
  // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
114
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@@ -116,8 +116,8 @@ extern "C" {
116
  // Returns 0 on success
117
  WHISPER_API int whisper_encode(
118
  struct whisper_context * ctx,
119
- int offset,
120
- int n_threads);
121
 
122
  // Run the Whisper decoder to obtain the logits and probabilities for the next token.
123
  // Make sure to call whisper_encode() first.
@@ -126,10 +126,10 @@ extern "C" {
126
  // Returns 0 on success
127
  WHISPER_API int whisper_decode(
128
  struct whisper_context * ctx,
129
- const whisper_token * tokens,
130
- int n_tokens,
131
- int n_past,
132
- int n_threads);
133
 
134
  // Token sampling methods.
135
  // These are provided for convenience and can be used after each call to whisper_decode().
@@ -169,6 +169,9 @@ extern "C" {
169
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
170
  WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
171
 
 
 
 
172
  ////////////////////////////////////////////////////////////////////////////
173
 
174
  // Available sampling strategies
@@ -187,12 +190,12 @@ extern "C" {
187
 
188
  int n_threads;
189
  int n_max_text_ctx;
190
- int offset_ms; // start offset in ms
191
- int duration_ms; // audio duration to process in ms
192
 
193
  bool translate;
194
  bool no_context;
195
- bool single_segment; // force single segment output (useful for streaming)
196
  bool print_special;
197
  bool print_progress;
198
  bool print_realtime;
@@ -206,8 +209,8 @@ extern "C" {
206
  int max_tokens; // max tokens per segment (0 = no limit)
207
 
208
  // [EXPERIMENTAL] speed-up techniques
209
- bool speed_up; // speed-up the audio by 2x using Phase Vocoder
210
- int audio_ctx; // overwrite the audio context size (0 = use default)
211
 
212
  // tokens to provide the whisper model as initial prompt
213
  // these are prepended to any existing text context from a previous call
@@ -235,20 +238,20 @@ extern "C" {
235
  // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
236
  // Uses the specified decoding strategy to obtain the text.
237
  WHISPER_API int whisper_full(
238
- struct whisper_context * ctx,
239
- struct whisper_full_params params,
240
- const float * samples,
241
- int n_samples);
242
 
243
  // Split the input audio in chunks and process each chunk separately using whisper_full()
244
  // It seems this approach can offer some speedup in some cases.
245
  // However, the transcription accuracy can be worse at the beginning and end of each chunk.
246
  WHISPER_API int whisper_full_parallel(
247
- struct whisper_context * ctx,
248
- struct whisper_full_params params,
249
- const float * samples,
250
- int n_samples,
251
- const int n_processors);
252
 
253
  // Number of generated text segments.
254
  // A segment can be a few words, a sentence, or even a paragraph.
@@ -275,9 +278,6 @@ extern "C" {
275
  // Get the probability of the specified token in the specified segment.
276
  WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
277
 
278
- // Print system information
279
- WHISPER_API const char * whisper_print_system_info(void);
280
-
281
  #ifdef __cplusplus
282
  }
283
  #endif
 
72
  whisper_token id; // token id
73
  whisper_token tid; // forced timestamp token id
74
 
75
+ float p; // probability of the token
76
+ float pt; // probability of the timestamp token
77
+ float ptsum; // sum of probabilities of all timestamp tokens
78
 
79
  // token-level timestamp data
80
  // do not use if you haven't computed token-level timestamps
81
+ int64_t t0; // start time of the token
82
+ int64_t t1; // end time of the token
83
 
84
+ float vlen; // voice length of the token
85
  } whisper_token_data;
86
 
87
  // Allocates all memory needed for the model and loads the model from the given file.
 
96
  // Returns 0 on success
97
  WHISPER_API int whisper_pcm_to_mel(
98
  struct whisper_context * ctx,
99
+ const float * samples,
100
+ int n_samples,
101
+ int n_threads);
102
 
103
  // This can be used to set a custom log mel spectrogram inside the provided whisper context.
104
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
 
106
  // Returns 0 on success
107
  WHISPER_API int whisper_set_mel(
108
  struct whisper_context * ctx,
109
+ const float * data,
110
+ int n_len,
111
+ int n_mel);
112
 
113
  // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
114
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
 
116
  // Returns 0 on success
117
  WHISPER_API int whisper_encode(
118
  struct whisper_context * ctx,
119
+ int offset,
120
+ int n_threads);
121
 
122
  // Run the Whisper decoder to obtain the logits and probabilities for the next token.
123
  // Make sure to call whisper_encode() first.
 
126
  // Returns 0 on success
127
  WHISPER_API int whisper_decode(
128
  struct whisper_context * ctx,
129
+ const whisper_token * tokens,
130
+ int n_tokens,
131
+ int n_past,
132
+ int n_threads);
133
 
134
  // Token sampling methods.
135
  // These are provided for convenience and can be used after each call to whisper_decode().
 
169
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
170
  WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
171
 
172
+ // Print system information
173
+ WHISPER_API const char * whisper_print_system_info(void);
174
+
175
  ////////////////////////////////////////////////////////////////////////////
176
 
177
  // Available sampling strategies
 
190
 
191
  int n_threads;
192
  int n_max_text_ctx;
193
+ int offset_ms; // start offset in ms
194
+ int duration_ms; // audio duration to process in ms
195
 
196
  bool translate;
197
  bool no_context;
198
+ bool single_segment; // force single segment output (useful for streaming)
199
  bool print_special;
200
  bool print_progress;
201
  bool print_realtime;
 
209
  int max_tokens; // max tokens per segment (0 = no limit)
210
 
211
  // [EXPERIMENTAL] speed-up techniques
212
+ bool speed_up; // speed-up the audio by 2x using Phase Vocoder
213
+ int audio_ctx; // overwrite the audio context size (0 = use default)
214
 
215
  // tokens to provide the whisper model as initial prompt
216
  // these are prepended to any existing text context from a previous call
 
238
  // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
239
  // Uses the specified decoding strategy to obtain the text.
240
  WHISPER_API int whisper_full(
241
+ struct whisper_context * ctx,
242
+ struct whisper_full_params params,
243
+ const float * samples,
244
+ int n_samples);
245
 
246
  // Split the input audio in chunks and process each chunk separately using whisper_full()
247
  // It seems this approach can offer some speedup in some cases.
248
  // However, the transcription accuracy can be worse at the beginning and end of each chunk.
249
  WHISPER_API int whisper_full_parallel(
250
+ struct whisper_context * ctx,
251
+ struct whisper_full_params params,
252
+ const float * samples,
253
+ int n_samples,
254
+ int n_processors);
255
 
256
  // Number of generated text segments.
257
  // A segment can be a few words, a sentence, or even a paragraph.
 
278
  // Get the probability of the specified token in the specified segment.
279
  WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
280
 
 
 
 
281
  #ifdef __cplusplus
282
  }
283
  #endif