Spaces:
Sleeping
Sleeping
package whisper | |
import ( | |
"io" | |
"time" | |
) | |
///////////////////////////////////////////////////////////////////////////////
// TYPES
// SegmentCallback is the callback function for processing segments in real | |
// time. It is called during the Process function | |
type SegmentCallback func(Segment) | |
// ProgressCallback is the callback function for reporting progress during
// processing. It is called during the Process function with a single integer
// progress value.
//
// NOTE(review): the integer is presumably a percentage (0-100) — confirm
// against the implementation before relying on the range.
type ProgressCallback func(int)
// Model is the interface to a whisper model. Create a new model with the | |
// function whisper.New(string) | |
type Model interface { | |
io.Closer | |
// Return a new speech-to-text context. | |
NewContext() (Context, error) | |
// Return true if the model is multilingual. | |
IsMultilingual() bool | |
// Return all languages supported. | |
Languages() []string | |
} | |
// Context is the speach recognition context. | |
type Context interface { | |
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language. | |
SetTranslate(bool) // Set translate flag | |
IsMultilingual() bool // Return true if the model is multilingual. | |
Language() string // Get language | |
SetOffset(time.Duration) // Set offset | |
SetDuration(time.Duration) // Set duration | |
SetThreads(uint) // Set number of threads to use | |
SetSplitOnWord(bool) // Set split on word flag | |
SetTokenThreshold(float32) // Set timestamp token probability threshold | |
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold | |
SetMaxSegmentLength(uint) // Set max segment length in characters | |
SetTokenTimestamps(bool) // Set token timestamps flag | |
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit) | |
SetAudioCtx(uint) // Set audio encoder context | |
SetMaxContext(n int) // Set maximum number of text context tokens to store | |
SetBeamSize(n int) // Set Beam Size | |
SetEntropyThold(t float32) // Set Entropy threshold | |
SetInitialPrompt(prompt string) // Set initial prompt | |
// Process mono audio data and return any errors. | |
// If defined, newly generated segments are passed to the | |
// callback function during processing. | |
Process([]float32, SegmentCallback, ProgressCallback) error | |
// After process is called, return segments until the end of the stream | |
// is reached, when io.EOF is returned. | |
NextSegment() (Segment, error) | |
IsBEG(Token) bool // Test for "begin" token | |
IsSOT(Token) bool // Test for "start of transcription" token | |
IsEOT(Token) bool // Test for "end of transcription" token | |
IsPREV(Token) bool // Test for "start of prev" token | |
IsSOLM(Token) bool // Test for "start of lm" token | |
IsNOT(Token) bool // Test for "No timestamps" token | |
IsLANG(Token, string) bool // Test for token associated with a specific language | |
IsText(Token) bool // Test for text token | |
// Timings | |
PrintTimings() | |
ResetTimings() | |
SystemInfo() string | |
} | |
// Segment is the text result of a speech recognition. | |
type Segment struct { | |
// Segment Number | |
Num int | |
// Time beginning and end timestamps for the segment. | |
Start, End time.Duration | |
// The text of the segment. | |
Text string | |
// The tokens of the segment. | |
Tokens []Token | |
} | |
// Token is a text or special token.
type Token struct {
	// Id is the token identifier.
	// NOTE(review): Go initialism style would spell this ID; the name is
	// kept because renaming an exported field would break existing callers.
	Id int

	// Text is the text of the token.
	Text string

	// P is a float32 score attached to the token.
	// NOTE(review): presumably the token probability — confirm.
	P float32

	// Start and End are time offsets for the token.
	// NOTE(review): presumably the token's begin/end timestamps within the
	// audio, mirroring Segment — confirm.
	Start, End time.Duration
}