Spaces:

natasa365
/

whisper.cpp

Sleeping

whisper.cpp / bindings /go /pkg /whisper /interface.go

Amanda Der Bedrosian

bindings.go : add DetectedLanguage to go bindings (#2947)

1830e27 unverified 9 months ago

3.83 kB

	package whisper

	import (
	"io"
	"time"
	)

	///////////////////////////////////////////////////////////////////////////////
	// TYPES

	// SegmentCallback receives segments in real time while audio is being
	// processed. It is invoked from within the Process function.
	type SegmentCallback func(segment Segment)

	// ProgressCallback receives progress reports while audio is being
	// processed. It is invoked from within the Process function.
	type ProgressCallback func(progress int)

	// EncoderBeginCallback is consulted to check whether we want to
	// continue processing. It is invoked from within the Process function.
	type EncoderBeginCallback func() (proceed bool)

	// Model is the interface to a whisper model; a new model is created with
	// the function whisper.New(string). It embeds io.Closer, so every Model
	// also provides Close.
	type Model interface {
		io.Closer

		// NewContext returns a new speech-to-text context, or an error.
		NewContext() (Context, error)

		// IsMultilingual reports whether the model is multilingual.
		IsMultilingual() bool

		// Languages returns all languages supported.
		Languages() []string
	}

	// Context is the speech recognition context. It groups the recognition
	// settings, the processing entry points, the special-token tests and a
	// few timing/system helpers.
	type Context interface {
		// SetLanguage sets the language to use for speech recognition; use
		// "auto" to auto-detect the language.
		SetLanguage(lang string) error

		// SetTranslate sets the translate flag.
		SetTranslate(translate bool)

		// IsMultilingual reports whether the model is multilingual.
		IsMultilingual() bool

		// Language returns the language.
		Language() string

		// DetectedLanguage returns the detected language.
		DetectedLanguage() string

		// SetOffset sets the offset.
		SetOffset(offset time.Duration)

		// SetDuration sets the duration.
		SetDuration(duration time.Duration)

		// SetThreads sets the number of threads to use.
		SetThreads(threads uint)

		// SetSplitOnWord sets the split-on-word flag.
		SetSplitOnWord(split bool)

		// SetTokenThreshold sets the timestamp token probability threshold.
		SetTokenThreshold(t float32)

		// SetTokenSumThreshold sets the timestamp token sum probability
		// threshold.
		SetTokenSumThreshold(t float32)

		// SetMaxSegmentLength sets the max segment length in characters.
		SetMaxSegmentLength(n uint)

		// SetTokenTimestamps sets the token timestamps flag.
		SetTokenTimestamps(enabled bool)

		// SetMaxTokensPerSegment sets the max tokens per segment
		// (0 = no limit).
		SetMaxTokensPerSegment(n uint)

		// SetAudioCtx sets the audio encoder context.
		SetAudioCtx(n uint)

		// SetMaxContext sets the maximum number of text context tokens to
		// store.
		SetMaxContext(n int)

		// SetBeamSize sets the beam size.
		SetBeamSize(n int)

		// SetEntropyThold sets the entropy threshold.
		SetEntropyThold(t float32)

		// SetInitialPrompt sets the initial prompt.
		SetInitialPrompt(prompt string)

		// SetTemperature sets the temperature.
		SetTemperature(t float32)

		// SetTemperatureFallback sets the temperature incrementation.
		SetTemperatureFallback(t float32)

		// Process handles mono audio data and returns any errors. If
		// defined, newly generated segments are passed to the callback
		// function during processing.
		Process(data []float32, encoderBegin EncoderBeginCallback, segment SegmentCallback, progress ProgressCallback) error

		// NextSegment, once Process has been called, returns segments
		// until the end of the stream is reached, at which point io.EOF
		// is returned.
		NextSegment() (Segment, error)

		// IsBEG tests for the "begin" token.
		IsBEG(token Token) bool

		// IsSOT tests for the "start of transcription" token.
		IsSOT(token Token) bool

		// IsEOT tests for the "end of transcription" token.
		IsEOT(token Token) bool

		// IsPREV tests for the "start of prev" token.
		IsPREV(token Token) bool

		// IsSOLM tests for the "start of lm" token.
		IsSOLM(token Token) bool

		// IsNOT tests for the "No timestamps" token.
		IsNOT(token Token) bool

		// IsLANG tests for the token associated with a specific language.
		IsLANG(token Token, lang string) bool

		// IsText tests for a text token.
		IsText(token Token) bool

		// Timings: PrintTimings and ResetTimings take no arguments and
		// return nothing; what is printed or reset is up to the
		// implementation.
		PrintTimings()
		ResetTimings()

		// SystemInfo returns a string; presumably a description of the
		// system — confirm against the implementation.
		SystemInfo() string
	}

	// Segment holds the text result of a speech recognition.
	type Segment struct {
		// Num is the segment number.
		Num int

		// Start is the beginning timestamp of the segment.
		Start time.Duration

		// End is the end timestamp of the segment.
		End time.Duration

		// Text is the text of the segment.
		Text string

		// Tokens holds the tokens of the segment.
		Tokens []Token
	}

	// Token is a text or special token.
	type Token struct {
		// Id is the token's integer identifier.
		Id int

		// Text is the token's text.
		Text string

		// P is presumably the token's probability — confirm against the
		// code that populates it.
		P float32

		// Start and End are durations bounding the token; presumably its
		// beginning and end timestamps, mirroring Segment — confirm.
		Start time.Duration
		End   time.Duration
	}