ggerganov · regularfry · Feb 1, 2024 · Feb 1, 2024 · Feb 3, 2024 · Feb 3, 2024
diff --git a/Makefile b/Makefile
@@ -348,7 +348,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream stream-stdin command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
 
 #
 # Examples
@@ -375,6 +375,9 @@ server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
+# stream-stdin: same shape as the `stream` rule, but without the SDL sources
+# and flags; the audio input code comes from examples/audio-stdin.cpp instead.
+stream-stdin: examples/stream-stdin/stream-stdin.cpp $(SRC_COMMON) examples/audio-stdin.h examples/audio-stdin.cpp $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/stream-stdin/stream-stdin.cpp $(SRC_COMMON) examples/audio-stdin.cpp $(WHISPER_OBJ) -o stream-stdin $(LDFLAGS)
+
 command: examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
 

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -36,6 +36,19 @@ target_link_libraries(${TARGET} PRIVATE whisper)
 
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+# audio-stdin
+#
+# Static helper library built from audio-stdin.cpp; the stream-stdin example
+# links against it.
+
+set(TARGET audio-stdin)
+
+add_library(${TARGET} STATIC
+    audio-stdin.h
+    audio-stdin.cpp
+    )
+
+include(DefaultTargetOptions)
+
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
 if (WHISPER_SDL2)
     # common-sdl
 
@@ -69,6 +82,7 @@ elseif(CMAKE_JS_VERSION)
 else()
     add_subdirectory(main)
     add_subdirectory(stream)
+    add_subdirectory(stream-stdin)
     add_subdirectory(server)
     add_subdirectory(command)
     add_subdirectory(bench)

diff --git a/examples/audio-stdin.cpp b/examples/audio-stdin.cpp
@@ -0,0 +1,185 @@
+#include "audio-stdin.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <csignal>
+#include <cstdio>
+#include <cstring>
+
+#include <unistd.h>
+
+// Process-wide "please quit" flag.
+//
+// Because the original happened to handle OS signals in the same library as
+// handled the audio, this is implemented here.
+// TODO: split this out to something a bit more coherent
+//
+// The flag is written from quit_signal_handler(), so it needs a type that may
+// safely be modified inside a signal handler.  A plain bool gives no such
+// guarantee; volatile std::sig_atomic_t does, and `volatile` also stops the
+// compiler from caching the value inside a polling loop.  Assigning `true`
+// stores 1, and `!should_quit` still reads as a boolean test.
+volatile std::sig_atomic_t should_quit = 0;
+
+// Signal handler: records a quit request when SIGINT or SIGTERM is delivered.
+// Every other signal number is ignored.
+void quit_signal_handler(int signal) {
+  switch (signal) {
+    case SIGINT:
+    case SIGTERM:
+      should_quit = true;
+      break;
+    default:
+      break;
+  }
+}
+
+// Route SIGINT and SIGTERM (in that order) to quit_signal_handler().
+void install_signal_handler() {
+    const int quit_signals[] = { SIGINT, SIGTERM };
+
+    for (const int sig : quit_signals) {
+        std::signal(sig, quit_signal_handler);
+    }
+}
+
+// True until a quit has been requested through should_quit.
+bool should_keep_running() {
+  const bool quit_requested = should_quit;
+  return !quit_requested;
+}
+
+
+
+// Remember the requested buffer length (in milliseconds) and start out in the
+// not-running state.  No buffers are allocated here; init() does that.
+audio_stdin::audio_stdin(int len_ms)
+    : m_len_ms(len_ms),
+      m_running(false) {
+}
+
+// Destructor.  The file descriptor handed to init() is borrowed rather than
+// owned, so it is deliberately left open.
+audio_stdin::~audio_stdin() {
+  // Nothing to do here, we don't own m_fd
+}
+
+/*
+Set up the stdin reader.
+
+  fd          - file descriptor to read the raw audio from.  For simplicity it
+                needs to already be open, and the destructor doesn't close it.
+  sample_rate - samples per second of the incoming audio.
+
+Sizes both the float ring buffer and the int16 staging buffer to hold len_ms
+(from the constructor) milliseconds of audio, and resets the ring buffer to
+empty.
+
+Returns false and leaves the object unchanged when fd is negative, or when
+sample_rate / len_ms are not positive or are so small that the buffer would
+hold no samples at all.  Returns true otherwise.
+*/
+bool audio_stdin::init(int fd, int sample_rate) {
+  if (fd < 0 || sample_rate <= 0 || m_len_ms <= 0) {
+    fprintf(stderr, "%s: invalid arguments (fd = %d, sample_rate = %d, len_ms = %d)\n",
+            __func__, fd, sample_rate, m_len_ms);
+    return false;
+  }
+
+  // Multiply in 64 bits: sample_rate * len_ms can exceed the range of int for
+  // long buffers.
+  const int64_t n_wanted = (static_cast<int64_t>(sample_rate) * m_len_ms) / 1000;
+  if (n_wanted <= 0) {
+    fprintf(stderr, "%s: buffer of %d ms at %d Hz would be empty\n",
+            __func__, m_len_ms, sample_rate);
+    return false;
+  }
+  const size_t buffer_size = static_cast<size_t>(n_wanted);
+
+  m_fd = fd;
+  m_sample_rate = sample_rate;
+
+  m_audio.resize(buffer_size);
+  m_in_buffer.resize(buffer_size);
+
+  // A repeated init() may shrink the buffers, so never keep an old position.
+  m_audio_pos = 0;
+  m_audio_len = 0;
+
+  return true;
+}
+
+// Switch the reader to the running state.  Always succeeds.
+bool audio_stdin::resume() {
+  // Nothing has to be started in the background: in this initial
+  // implementation stdin is assumed to be fast enough to read synchronously,
+  // so raising the flag is all that is needed.
+  m_running.store(true);
+  return true;
+}
+
+// Switch the reader to the not-running state.  Always succeeds.
+bool audio_stdin::pause() {
+  // As with resume(), there is nothing to stop; while the flag is down we
+  // simply never read from stdin.
+  m_running.store(false);
+  return true;
+}
+
+// Forget all buffered audio by resetting the ring buffer position and length.
+// Returns false (after logging) when the reader is not running.
+bool audio_stdin::clear() {
+    if (m_running) {
+        // We don't use threads ourselves, but that doesn't mean nobody else
+        // does, so the reset happens under the mutex.
+        std::lock_guard<std::mutex> guard(m_mutex);
+
+        m_audio_pos = 0;
+        m_audio_len = 0;
+
+        return true;
+    }
+
+    fprintf(stderr, "%s: not running!\n", __func__);
+    return false;
+}
+
+// Pull fresh audio from the input descriptor into the circular buffer.
+//
+// Important: len is a number of bytes of *float* output, i.e. the number of
+// samples requested is len / sizeof(float), clamped to the buffer capacity.
+// A non-positive len requests nothing.
+//
+// We aren't called by SDL.  Instead we're called whenever whisper runs close
+// enough to being out of audio that the next iteration would stall.
+//
+// Input format: stdin is PCM mono 16khz in s16le format (use ffmpeg to make
+// that happen).  The bytes are read straight into an int16_t buffer, so they
+// are interpreted in host byte order, which matches s16le on little-endian
+// machines.
+//
+// read() is repeated until the requested number of samples has arrived, so
+// this blocks while the producer is slower than we are.  The descriptor is
+// assumed to be in blocking mode; any read error other than EINTR is fatal.
+//
+// Locking: m_mutex is intentionally NOT taken here.  get() calls this with
+// the mutex already held, and std::mutex is not recursive.
+//
+// Returns true when paused (nothing is read) and when samples were stored.
+// Returns false, and sets should_quit, when the stream ended before a single
+// complete sample arrived, when read() failed, or when the buffers have not
+// been set up by init().  Also returns false if a signal interrupted the read
+// after a quit had been requested.
+bool audio_stdin::callback(int len) {
+    if (!m_running) {
+        return true;
+    }
+
+    const size_t capacity = m_audio.size();
+    if (capacity == 0 || m_in_buffer.size() < capacity) {
+        fprintf(stderr, "%s: buffers not initialised, call init() first\n", __func__);
+        should_quit = true;
+        return false;
+    }
+
+    const size_t n_requested = len > 0 ? static_cast<size_t>(len) / sizeof(float) : 0;
+    const size_t n_samples   = std::min(n_requested, capacity);
+
+    // read() counts bytes, not samples: each incoming sample is one int16_t.
+    const size_t want_bytes = n_samples * sizeof(int16_t);
+    char * const raw        = reinterpret_cast<char *>(m_in_buffer.data());
+    size_t       got_bytes  = 0;
+
+    // Now we pull as much as we need from stdin, blocking if we have to.
+    while (got_bytes < want_bytes) {
+        const ssize_t nread = read(m_fd, raw + got_bytes, want_bytes - got_bytes);
+
+        if (nread > 0) {
+            got_bytes += static_cast<size_t>(nread);
+            continue;
+        }
+        if (nread == 0) {
+            // End of stream; keep whatever complete samples we already have.
+            break;
+        }
+        if (errno == EINTR) {
+            // Interrupted by a signal: give up if it was a quit request,
+            // otherwise just try again.
+            if (should_quit) {
+                return false;
+            }
+            continue;
+        }
+
+        fprintf(stderr, "%s: read failed: %s\n", __func__, std::strerror(errno));
+        should_quit = true;
+        return false;
+    }
+
+    const size_t n_got = got_bytes / sizeof(int16_t);
+    if (n_samples > 0 && n_got == 0) {
+        // The stream is closed and there is nothing left to hand out.
+        should_quit = true;
+        return false;
+    }
+
+    // Nicked this from drwav.h, we're basically doing the same as
+    // drwav_s16_to_f32: scale by 1/32768.
+    const float scale_factor = 0.000030517578125f;
+
+    // Append at the current write position, wrapping at the end of the ring.
+    size_t dst = m_audio_pos % capacity;
+    for (size_t i = 0; i < n_got; ++i) {
+        m_audio[dst] = m_in_buffer[i] * scale_factor;
+        if (++dst == capacity) {
+            dst = 0;
+        }
+    }
+
+    m_audio_pos = dst;
+    m_audio_len = std::min(m_audio_len + n_got, capacity);
+
+    return true;
+}
+
+// Copy the most recent `ms` milliseconds of buffered audio into `result`;
+// ms <= 0 selects the full buffer length (m_len_ms).
+//
+// Modelled on audio_async::get, except that we can signal that the audio
+// stream is closed: every call first asks callback() for a fresh chunk of the
+// same duration, and a false return from callback() is passed straight on
+// (with `result` left empty).
+//
+// When the reader is not running this only logs and returns true, without
+// touching `result`.
+bool audio_stdin::get(int ms, std::vector<float> & result) {
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return true;
+    }
+
+    result.clear();
+
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    const int window_ms = (ms <= 0) ? m_len_ms : ms;
+
+    size_t wanted = (m_sample_rate * window_ms) / 1000;
+
+    // We're double-buffering here; it's ungainly, not actually a problem.
+    const bool stream_open = callback(wanted * sizeof(float));
+    if (!stream_open) {
+        return false;
+    }
+
+    // Never hand back more than has actually been buffered.
+    if (m_audio_len < wanted) {
+        wanted = m_audio_len;
+    }
+    result.resize(wanted);
+
+    // Index of the first sample of the window that ends at the write position.
+    int start = m_audio_pos - wanted;
+    if (start < 0) {
+        start += m_audio.size();
+    }
+
+    const size_t until_end = m_audio.size() - start;
+
+    if (wanted > until_end) {
+        // The window wraps: tail of the ring first, then the head.
+        memcpy(result.data(), &m_audio[start], until_end * sizeof(float));
+        memcpy(&result[until_end], &m_audio[0], (wanted - until_end) * sizeof(float));
+    } else {
+        memcpy(result.data(), &m_audio[start], wanted * sizeof(float));
+    }
+
+    return true;
+}
diff --git a/examples/audio-stdin.h b/examples/audio-stdin.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <vector>
+#include <mutex>
+
+//
+// Stdin audio capture: reads raw samples from an already-open file descriptor
+//
+
+class audio_stdin {
+public:
+    // len_ms: length, in milliseconds, of the circular buffer that init() allocates
+    audio_stdin(int len_ms);
+    ~audio_stdin();
+
+    // fd must already be open and is not closed by the destructor;
+    // sample_rate is the number of samples per second of the incoming audio
+    bool init(int fd, int sample_rate);
+
+    // The sdl version captures to a circular buffer; this class keeps the same
+    // interface.  Nothing is captured in the background here: resume()/pause()
+    // only toggle the running flag that callback(), get() and clear() check.
+    // The circular buffer is sized for len_ms milliseconds of audio.
+    bool resume();
+    bool pause();
+    // reset the circular buffer to empty; returns false when not running
+    bool clear();
+
+    // callback to be called when we run out of data.
+    // This is what will do a read() from stdin, and can block if there isn't
+    // enough data yet.
+    // len is a number of bytes (of float samples), not a number of samples.
+    // Returns false if the stream's closed.
+    bool callback(int len);
+
+    // get audio data from the circular buffer
+    // ms <= 0 means the full buffer length (len_ms).
+    // Returns false if the stream's closed.
+    bool get(int ms, std::vector<float> & audio);
+
+private:
+    // borrowed input descriptor, set by init()
+    int m_fd = 0;
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    // set by resume(), cleared by pause(); initialised by the constructor
+    std::atomic_bool m_running;
+    std::mutex       m_mutex;
+
+    // circular buffer of converted float samples
+    std::vector<float> m_audio;
+    // Since the data we plan on receiving needs converting, we need somewhere to hold it while we do that
+    std::vector<int16_t> m_in_buffer;
+    // m_audio_pos: next write index in m_audio; m_audio_len: number of valid samples
+    size_t             m_audio_pos = 0;
+    size_t             m_audio_len = 0;
+};
+
+// Returns false once the program should quit: after SIGINT/SIGTERM (when the
+// handler is installed) or once the input stream has reached EOF.
+bool should_keep_running();
+// Installs the SIGINT/SIGTERM handlers.  Call it once at startup, before
+// relying on should_keep_running() to report those signals.
+void install_signal_handler();
diff --git a/examples/stream-stdin/CMakeLists.txt b/examples/stream-stdin/CMakeLists.txt
@@ -0,0 +1,7 @@
+# stream-stdin
+#
+# Example executable built from stream-stdin.cpp; links the common and
+# audio-stdin helper libraries plus whisper.
+set(TARGET stream-stdin)
+add_executable(${TARGET} stream-stdin.cpp)
+
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET} PRIVATE common audio-stdin whisper ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/stream-stdin/README.md b/examples/stream-stdin/README.md
@@ -0,0 +1,23 @@
+# stream-stdin
+
+This is a naive example of performing real-time inference on audio
+read from standard input.  The `stream-stdin` tool samples the audio
+every half a second and runs the transcription continuously, just like
+the `stream` example, except that it doesn't need to be compiled with
+SDL to do it.
+
+```shell
+$ ffmpeg -i capture.wav -acodec pcm_s16le -f s16le -ac 1 -ar 16000 - | ./stream-stdin -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+```
+
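+It expects raw, mono, 16 kHz, signed 16-bit little-endian (`s16le`)
+audio on stdin.  Because the stream is raw, the tool cannot detect the
+format, so convert the audio beforehand (for example with `ffmpeg`,
+as shown above).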
+
+## Why this matters
+
+Because you can stream the audio over the network with `ffmpeg`'s
+`rtp` support.  Or you can just use `netcat`.
+
+## Building
+
+```shell
+$ make stream-stdin
+```