Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP Very rough cut of streaming from stdin. #1823

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ libwhisper.so: $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)

clean:
rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
rm -f *.o main stream stream-stdin command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so

#
# Examples
Expand All @@ -375,6 +375,9 @@ server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

stream-stdin: examples/stream-stdin/stream-stdin.cpp $(SRC_COMMON) examples/audio-stdin.h examples/audio-stdin.cpp $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream-stdin/stream-stdin.cpp $(SRC_COMMON) examples/audio-stdin.cpp $(WHISPER_OBJ) -o stream-stdin $(LDFLAGS)

command: examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

Expand Down
14 changes: 14 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ target_link_libraries(${TARGET} PRIVATE whisper)

set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

# audio-stdin

set(TARGET audio-stdin)

add_library(${TARGET} STATIC
audio-stdin.h
audio-stdin.cpp
)

include(DefaultTargetOptions)

set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

if (WHISPER_SDL2)
# common-sdl

Expand Down Expand Up @@ -69,6 +82,7 @@ elseif(CMAKE_JS_VERSION)
else()
add_subdirectory(main)
add_subdirectory(stream)
add_subdirectory(stream-stdin)
add_subdirectory(server)
add_subdirectory(command)
add_subdirectory(bench)
Expand Down
185 changes: 185 additions & 0 deletions examples/audio-stdin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#include "audio-stdin.h"

#include <algorithm>
#include <cerrno>
#include <csignal>
#include <cstdio>
#include <cstring>

#include <unistd.h>

// Because the original happened to handle OS signals in the same library as
// handled the audio, this is implemented here.
// TODO: split this out to something a bit more coherent

// Set to non-zero once the program has been asked to stop.
// It is written from a signal handler, so it must be a volatile
// std::sig_atomic_t: that is the only object type the standard guarantees a
// handler may safely assign to.
volatile std::sig_atomic_t should_quit = 0;

// Signal handler: requests shutdown on SIGINT / SIGTERM and ignores anything
// else. Only performs a single flag store, which is async-signal-safe.
void quit_signal_handler(int signal) {
    if (signal == SIGINT || signal == SIGTERM) {
        should_quit = 1;
    }
}

// Route SIGINT and SIGTERM to quit_signal_handler().
void install_signal_handler() {
    std::signal(SIGINT,  quit_signal_handler);
    std::signal(SIGTERM, quit_signal_handler);
}

// Returns true until a shutdown has been requested via should_quit.
bool should_keep_running() {
    return should_quit == 0;
}



// Remember how many milliseconds of audio the circular buffer should hold.
// The reader starts out paused; buffers are only allocated by init().
audio_stdin::audio_stdin(int len_ms)
    : m_len_ms(len_ms),
      m_running(false) {
}

// The file descriptor handed to init() is borrowed, not owned, so there is
// nothing to release here.
audio_stdin::~audio_stdin() = default;

/*
Setup the stdin reader. For simplicity, let's say that the file descriptor
passed in needs to already be open, and that the destructor doesn't close it.
*/
bool audio_stdin::init(int fd, int sample_rate) {
m_fd = fd;
m_sample_rate = sample_rate;

size_t buffer_size = (m_sample_rate*m_len_ms)/1000;
m_audio.resize(buffer_size);
m_in_buffer.resize(buffer_size);

return true;
}

// Mark the reader as running. Nothing happens in the background: input is
// read synchronously when audio is requested, so flipping the flag is all
// that is needed. Always succeeds.
bool audio_stdin::resume() {
    m_running.store(true);
    return true;
}

// Mark the reader as paused. While paused, callback() returns without
// touching the input descriptor. Always succeeds.
bool audio_stdin::pause() {
    m_running.store(false);
    return true;
}

// Forget everything currently held in the circular buffer.
// Returns false (after logging to stderr) when the reader is not running.
bool audio_stdin::clear() {
    if (m_running) {
        // We don't spawn threads ourselves, but that doesn't mean nobody
        // else is using this object from one, so take the lock anyway.
        std::lock_guard<std::mutex> guard(m_mutex);

        m_audio_len = 0;
        m_audio_pos = 0;

        return true;
    }

    fprintf(stderr, "%s: not running!\n", __func__);
    return false;
}

// Read fresh samples from the input descriptor into the circular buffer.
//
// Important: len is a number of bytes of *float output*, i.e. the number of
// samples wanted times sizeof(float). That is how get() calls it.
//
// We aren't called by SDL. get() calls us with m_mutex already held, so this
// function must not lock m_mutex itself (std::mutex is not recursive). Anyone
// calling it directly is responsible for the locking.
//
// The input is expected to be mono PCM in s16le format (use ffmpeg to make
// that happen). Samples are taken in host byte order, so this presumably only
// works as-is on little-endian hosts.
//
// Blocks until the requested number of samples has arrived, the stream ends,
// or an error occurs. The request is capped at the buffer capacity.
//
// Returns false once the stream is closed with nothing left to deliver, when a
// read fails, or when a shutdown signal interrupts the read; EOF and read
// errors also set should_quit. Returns true otherwise, including when the
// reader is paused or there is nothing to do.
bool audio_stdin::callback(int len) {
    if (!m_running) {
        return true;
    }

    if (len <= 0) {
        return true;
    }

    const size_t capacity = m_audio.size();

    // Never ask for more than either buffer can hold.
    size_t n_wanted = static_cast<size_t>(len) / sizeof(float);
    n_wanted = std::min(n_wanted, std::min(capacity, m_in_buffer.size()));
    if (n_wanted == 0) {
        return true;
    }

    // Pull as much as we need from the descriptor, blocking if we have to.
    // read() counts bytes, and may legitimately return fewer than requested
    // (pipes, sockets), so keep going until the request is satisfied.
    char * const raw          = reinterpret_cast<char *>(m_in_buffer.data());
    const size_t bytes_wanted = n_wanted * sizeof(int16_t);
    size_t       bytes_read   = 0;

    while (bytes_read < bytes_wanted) {
        const ssize_t nread = read(m_fd, raw + bytes_read, bytes_wanted - bytes_read);

        if (nread < 0) {
            if (errno == EINTR) {
                // Interrupted by a signal: stop if it was a quit request,
                // otherwise just retry.
                if (should_quit) {
                    return false;
                }
                continue;
            }

            fprintf(stderr, "%s: read failed: %s\n", __func__, strerror(errno));
            should_quit = true;
            return false;
        }

        if (nread == 0) {
            // End of stream. Anything already read is still delivered below;
            // the next call will then see EOF immediately.
            break;
        }

        bytes_read += static_cast<size_t>(nread);
    }

    // A trailing odd byte (possible only at EOF) cannot form a sample.
    const size_t n_samples = bytes_read / sizeof(int16_t);
    if (n_samples == 0) {
        should_quit = true;
        return false;
    }

    // Same conversion as drwav_s16_to_f32: scale by 1/32768.
    const float scale_factor = 0.000030517578125f;

    // Append at the write position, wrapping around the end of the ring.
    size_t pos = m_audio_pos;
    for (size_t i = 0; i < n_samples; ++i) {
        m_audio[pos] = m_in_buffer[i] * scale_factor;
        if (++pos == capacity) {
            pos = 0;
        }
    }

    m_audio_pos = pos;
    m_audio_len = std::min(m_audio_len + n_samples, capacity);

    return true;
}

// Fetch the most recent `ms` milliseconds of audio into `result`.
//
// First pulls fresh samples from the input via callback(), then copies the
// newest samples out of the circular buffer in chronological order. A
// non-positive `ms` means "the whole buffer" (m_len_ms). Fewer samples than
// requested are returned when the buffer does not hold that many yet.
//
// Returns false when callback() reports that the stream is closed, in which
// case `result` is left empty. Returns true otherwise. When the reader is not
// running this logs to stderr and returns true without modifying `result`.
bool audio_stdin::get(int ms, std::vector<float> & result) {

    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return true;
    }

    result.clear();

    std::lock_guard<std::mutex> lock(m_mutex);

    if (ms <= 0) {
        ms = m_len_ms;
    }

    size_t n_samples = (m_sample_rate * ms) / 1000;

    // callback() is invoked with the lock held; it does the (possibly
    // blocking) read and appends to the ring.
    if (!callback(n_samples * sizeof(float))) {
        return false;
    }

    const size_t capacity = m_audio.size();

    if (n_samples > m_audio_len) {
        n_samples = m_audio_len;
    }
    if (n_samples > capacity) {
        n_samples = capacity;
    }
    if (n_samples == 0) {
        // Nothing buffered (or no buffer at all): avoid indexing into an
        // empty vector below.
        return true;
    }

    result.resize(n_samples);

    // Index of the oldest requested sample, computed entirely in unsigned
    // arithmetic. n_samples <= capacity, so the sum cannot underflow.
    const size_t s0 = (m_audio_pos + capacity - n_samples) % capacity;

    // Samples available before the read would run off the end of the ring.
    const size_t n0 = capacity - s0;

    if (n_samples > n0) {
        memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
        memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
    } else {
        memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
    }

    return true;
}
54 changes: 54 additions & 0 deletions examples/audio-stdin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#pragma once

#include <atomic>
#include <cstdint>
#include <vector>
#include <mutex>

//
// Stdin wav capture
//

// Reads raw 16-bit PCM samples from an already-open file descriptor and keeps
// the most recent len_ms milliseconds of them, converted to float, in a
// circular buffer.
class audio_stdin {
public:
    // len_ms: how much audio (in milliseconds) the circular buffer holds.
    audio_stdin(int len_ms);
    ~audio_stdin();

    // Attach to an open descriptor and size the buffers for sample_rate.
    // The descriptor is borrowed: it is never closed by this class.
    bool init(int fd, int sample_rate);

    // resume()/pause() only toggle whether reads are performed; there is no
    // background capture. clear() empties the circular buffer and fails when
    // the reader is not running.
    bool resume();
    bool pause();
    bool clear();

    // Performs the read() from the descriptor and may block until enough
    // data is available. len is a byte count of float samples wanted.
    // Returns false if the stream's closed.
    bool callback(int len);

    // Refill via callback(), then copy the latest ms milliseconds of audio
    // out of the circular buffer (ms <= 0 means the full buffer length).
    // Returns false if the stream's closed.
    bool get(int ms, std::vector<float> & audio);

private:
    int m_fd{0};
    int m_len_ms{0};
    int m_sample_rate{0};

    std::atomic<bool> m_running;
    std::mutex        m_mutex;

    // Circular buffer of converted samples.
    std::vector<float>   m_audio;
    // Staging area for the raw 16-bit input while it is being converted.
    std::vector<int16_t> m_in_buffer;
    size_t m_audio_pos{0};
    size_t m_audio_len{0};
};

// Returns false once the program should stop: after SIGINT/SIGTERM has been
// received (when the handler is installed) or after audio_stdin::callback()
// has hit end-of-file on its input.
bool should_keep_running();
// Installs the SIGINT/SIGTERM handler that makes should_keep_running() return
// false. Call this once at startup, before entering the processing loop.
void install_signal_handler();
7 changes: 7 additions & 0 deletions examples/stream-stdin/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# stream-stdin
set(TARGET stream-stdin)
add_executable(${TARGET} stream-stdin.cpp)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE common audio-stdin whisper ${CMAKE_THREAD_LIBS_INIT})
bobqianic marked this conversation as resolved.
Show resolved Hide resolved
23 changes: 23 additions & 0 deletions examples/stream-stdin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# stream-stdin

This is a naive example of performing real-time inference on audio
piped in on standard input. The `stream-stdin` tool samples the audio every
half a second and runs the transcription continuously, just like the
`stream` example, except that it doesn't need to be compiled with SDL to do
it.

```shell
$ ffmpeg -i capture.wav -acodec pcm_s16le -f s16le -ac 1 -ar 16000 - | ./stream-stdin -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

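It expects raw, headerless, mono, signed 16-bit little-endian PCM on stdin
(16 kHz in the example above). Because the stream is raw, the tool has no way
to detect its format, so any conversion must happen before the audio is piped
in, for example with `ffmpeg` as shown.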

## Why this matters

Because you can stream the audio over the network with `ffmpeg`'s
`rtp` support. Or you can just use `netcat`.

## Building

$ `make stream-stdin`