Skip to content

Commit

Permalink
ref ggerganov#9 : add API documentation in whisper.h
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Oct 8, 2022
1 parent 380dfe5 commit 72aebcc
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 9 deletions.
8 changes: 4 additions & 4 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,25 +230,25 @@ int main(int argc, char ** argv) {

// print result
if (!wparams.print_realtime) {
fprintf(stderr, "\n");
printf("\n");

const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);

if (params.no_timestamps) {
fprintf(stderr, "%s", text);
printf("%s", text);
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
}
}

fprintf(stderr, "\n");
printf("\n");

// output to text file
if (params.output_txt) {
Expand Down
74 changes: 69 additions & 5 deletions whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,72 +31,131 @@ extern "C" {
// C interface
//
// TODO: documentation will come soon
//
// Basic usage:
//
// #include "whisper.h"
//
// ...
//
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
// return 7;
// }
//
// const int n_segments = whisper_full_n_segments(ctx);
// for (int i = 0; i < n_segments; ++i) {
// const char * text = whisper_full_get_segment_text(ctx, i);
// printf("%s", text);
// }
//
// whisper_free(ctx);
//
// ...
//
// This is a demonstration of the most straightforward usage of the library.
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
//
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
// understanding of how the model works.
//

struct whisper_context;

typedef int whisper_token;

// Allocates all memory needed for the model and loads the model from the given file.
// Returns NULL on failure.
WHISPER_API struct whisper_context * whisper_init(const char * path_model);

// Frees all memory allocated by the model.
WHISPER_API void whisper_free(struct whisper_context * ctx);

// Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the provided whisper context.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx,
const float * samples,
int n_samples,
int n_threads);

// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80
// Returns 0 on success
WHISPER_API int whisper_set_mel(
struct whisper_context * ctx,
const float * data,
int n_len,
int n_mel);

// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// offset can be used to specify the offset of the first frame in the spectrogram.
// Returns 0 on success
WHISPER_API int whisper_encode(
struct whisper_context * ctx,
int offset,
int n_threads);

// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
// tokens + n_tokens is the provided context for the decoder.
// n_past is the number of tokens to use from previous decoder calls.
// Returns 0 on success
WHISPER_API int whisper_decode(
struct whisper_context * ctx,
const whisper_token * tokens,
int n_tokens,
int n_past,
int n_threads);

// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);

// return the id of the specified language, returns -1 if not found
// Return the id of the specified language, returns -1 if not found
WHISPER_API int whisper_lang_id(const char * lang);

WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);

// The probabilities for the next token
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);

// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);

// Special tokens
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);

// Task tokens
WHISPER_API whisper_token whisper_token_translate ();
WHISPER_API whisper_token whisper_token_transcribe();

// Performance information
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);

////////////////////////////////////////////////////////////////////////////

// Available decoding strategies
enum whisper_decode_strategy {
WHISPER_DECODE_GREEDY,
WHISPER_DECODE_BEAM_SEARCH,
WHISPER_DECODE_GREEDY, // Always select the most probable token
WHISPER_DECODE_BEAM_SEARCH, // TODO: not implemented yet!
};

struct whisper_full_params {
Expand Down Expand Up @@ -129,18 +188,23 @@ extern "C" {

WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);

// full whisper run - encode + decode
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);

// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);

// Get the start and end time of the specified segment.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);

// Get the text of the specified segment.
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);

#ifdef __cplusplus
Expand Down

0 comments on commit 72aebcc

Please sign in to comment.