Skip to content

Commit

Permalink
Synchronize after every op
Browse files Browse the repository at this point in the history
The previous commit loosened the synchronization barriers too much, and
it caused some rare brain damage when using quants. With this change,
we're now only using 3-4x fewer synchronization barriers than before.

A new --trap flag has been introduced by this change, which enables the
runtime to detect the precise moment a NaN appears anywhere in the code
and then aborts with both a C++ backtrace and a GGML graph trace.

Some jankiness with pledge() and /dev/tty is also fixed by this change.
  • Loading branch information
jart committed Apr 30, 2024
1 parent bd8c0de commit 622924c
Show file tree
Hide file tree
Showing 8 changed files with 295 additions and 81 deletions.
4 changes: 2 additions & 2 deletions build/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
INSTALL = install

ARFLAGS = rcsD
CCFLAGS = -g -O3 -fexceptions
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes
CCFLAGS = -g -O3 -fexceptions -fsignaling-nans
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes -DLLAMAFILE_DEBUG
TARGET_ARCH = -Xx86_64-mavx -Xx86_64-mtune=znver4

TMPDIR = o//tmp
Expand Down
6 changes: 6 additions & 0 deletions llama.cpp/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
if (arg == "--cli") {
return true;
}
if (arg == "--trap") {
FLAG_trap = true;
FLAG_unsecure = true; // for better backtraces
llamafile_trapping_enabled(+1);
return true;
}
if (arg == "--unsecure") {
FLAG_unsecure = true;
return true;
Expand Down
45 changes: 32 additions & 13 deletions llama.cpp/console.cpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi

#include "console.h"

#include <vector>
#include <iostream>

#include <climits>
#include <sys/ioctl.h>
#include <unistd.h>
#include <wchar.h>
#include <cosmo.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
Expand All @@ -30,6 +32,7 @@ namespace console {

static bool advanced_display = false;
static bool simple_io = true;
static bool should_close_tty = false;
static display_t current_display = reset;
static FILE* out = stdout;
static FILE* tty = nullptr;
Expand All @@ -40,19 +43,32 @@ static termios initial_state;
//

void init(bool use_simple_io, bool use_advanced_display) {
advanced_display = use_advanced_display;
should_close_tty = false;
simple_io = use_simple_io;
advanced_display = use_advanced_display;
if (!simple_io) {
struct termios new_termios;
tcgetattr(STDIN_FILENO, &initial_state);
new_termios = initial_state;
new_termios.c_lflag &= ~(ICANON | ECHO);
new_termios.c_cc[VMIN] = 1;
new_termios.c_cc[VTIME] = 0;
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
tty = fopen("/dev/tty", "w+e");
if (tty) {
should_close_tty = true;
} else if (IsLinux() || IsOpenbsd()) {
// this could happen because pledge() blocked us
tty = fdopen(0, "w+e");
}
if (tty != nullptr) {
out = tty;
if (!tcgetattr(fileno(tty), &initial_state)) {
out = tty;
struct termios new_termios = initial_state;
new_termios.c_lflag &= ~(ICANON | ECHO);
new_termios.c_cc[VMIN] = 1;
new_termios.c_cc[VTIME] = 0;
tcsetattr(fileno(tty), TCSANOW, &new_termios);
} else {
simple_io = true;
fclose(tty);
tty = 0;
}
} else {
simple_io = true;
}
}
setlocale(LC_ALL, "");
Expand All @@ -64,11 +80,14 @@ void cleanup() {
// Restore settings
if (!simple_io) {
if (tty != nullptr) {
out = stdout;
fclose(tty);
fflush(tty);
tcsetattr(fileno(tty), TCSANOW, &initial_state);
if (should_close_tty) {
fclose(tty);
}
tty = nullptr;
out = stdout;
}
tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
}
}

Expand Down
78 changes: 28 additions & 50 deletions llama.cpp/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2679,6 +2679,7 @@ static inline int ggml_up(int n, int m) {
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
llamafile_trapping_enabled(-1);

static bool is_first_call = true;

Expand Down Expand Up @@ -2750,6 +2751,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);

llamafile_trapping_enabled(+1);
ggml_critical_section_end();

return NULL;
Expand Down Expand Up @@ -2781,6 +2783,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

llamafile_trapping_enabled(+1);
ggml_critical_section_end();

return ctx;
Expand Down Expand Up @@ -7105,19 +7108,12 @@ void ggml_syncthreads(struct ggml_compute_params * params) {
// Ops must call this before touching wdata; otherwise threads still
// finishing the previous op could clobber the buffer underneath us.
// NOTE(review): `limbo` appears to mark a pending wdata handoff that
// requires a barrier before reuse — confirm against ggml.h.
void *ggml_acquire_wdata(struct ggml_compute_params * params) {
    const bool handoff_pending = params->limbo;
    if (handoff_pending) {
        ggml_syncthreads(params);
    }
    params->limbo = true;
    return params->wdata;
}

// Ops should call this after writing to wdata and before reading it
// back, so every thread observes the completed writes.
void ggml_release_wdata(struct ggml_compute_params * params) {
    // The old body synchronized inside the `limbo` branch and then
    // fell through to a second unconditional ggml_syncthreads(),
    // paying for two back-to-back barriers with no work in between.
    // One barrier is sufficient; just clear the handoff flag first.
    params->limbo = false;
    ggml_syncthreads(params);
}

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -11531,12 +11527,9 @@ static void ggml_compute_forward_set_f32(

if (!inplace) {
if (!params->ith) {
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
memcpy(((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
ggml_syncthreads(params);
}
Expand Down Expand Up @@ -17759,35 +17752,6 @@ struct ggml_compute_state {
enum ggml_status ec;
};

// Reports whether `src` is one of `node`'s direct input tensors.
static bool ggml_has_src(const struct ggml_tensor * node,
                         const struct ggml_tensor * src) {
    int slot = 0;
    while (slot < GGML_MAX_SRC) {
        if (node->src[slot] == src) {
            return true;
        }
        ++slot;
    }
    return false;
}

// Reports whether `dest` depends on any node executed since `mark`.
//
// - cgraph->nodes[dest] is about to be executed
// - syncthreads() last happened right before cgraph->nodes[mark] ran
// - syncthreads() has 1.8 µs overhead minimum on Apple M2 when nth == 12,
//   which is why skipping unnecessary barriers is worth the bookkeeping
//
static bool ggml_needs_barrier(const struct ggml_cgraph * cgraph, int mark, int dest) {
    assert(mark >= 0);
    assert(mark <= dest);
    assert(dest < cgraph->n_nodes);
    const struct ggml_tensor * upcoming = cgraph->nodes[dest];
    // Scan every node completed since the last barrier; if any of them
    // feeds `upcoming`, a barrier is required before it may run.
    while (mark < dest) {
        if (ggml_has_src(upcoming, cgraph->nodes[mark])) {
            return true;
        }
        ++mark;
    }
    return false;
}

static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const struct ggml_cgraph * cgraph = state->shared->cgraph;
Expand All @@ -17804,6 +17768,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

set_numa_thread_affinity(state->ith);

#ifdef LLAMAFILE_DEBUG
if (FLAG_trap) {
llamafile_trapping_enabled(+1);
}
#endif

#ifdef GGML_PERF
int64_t start_cycles, start_time_us;
if (!state->ith) {
Expand All @@ -17819,17 +17789,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
return 0;
}

if (ggml_needs_barrier(cgraph, mark, node_n)) {
ggml_syncthreads(&params);
mark = node_n;
}
#ifdef LLAMAFILE_DEBUG
llamafile_debug_op_index = node_n;
#endif

struct ggml_tensor *node = cgraph->nodes[node_n];
params.nth = state->shared->n_threads;
ggml_compute_forward(&params, node);

// this barrier could potentially be eliminated in 15%+ of cases
// however, it would give rise to ghoulish errors w/ little gain
ggml_syncthreads(&params);

#if GGML_PERF
ggml_syncthreads(&state->shared->barrier);
if (!state->ith) {
int64_t end_cycles = ggml_perf_cycles();
int64_t end_time_us = ggml_perf_time_us();
Expand All @@ -17844,8 +17816,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
#endif
}

ggml_syncthreads(&params);

return 0;
}

Expand Down Expand Up @@ -18075,6 +18045,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

#ifdef LLAMAFILE_DEBUG
llamafile_debug_graph = cgraph;
#endif

// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
Expand Down Expand Up @@ -18134,6 +18108,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl

// fprintf(stderr, "%6d barriers %6d ops\n", state_shared.barrier.phase, cgraph->n_nodes);

#ifdef LLAMAFILE_DEBUG
llamafile_debug_graph = 0;
#endif

return compute_status;
}

Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,6 @@ extern "C" {
void * wdata;

struct ggml_barrier * barrier;
bool limbo;
};

GGML_API void ggml_syncthreads (struct ggml_compute_params *);
Expand Down
29 changes: 15 additions & 14 deletions llama.cpp/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,20 +184,6 @@ int main(int argc, char ** argv) {
return 1;
}

if (!params.mmproj.empty() &&
(!params.image.empty() ||
params.prompt.find("<img src=\"") != std::string::npos)) {
return llava_cli(argc, argv, &params);
}

// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));

// save choice to use color for later
// (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });

if (!FLAG_unsecure && !llamafile_has_gpu()) {
// Enable pledge() security on Linux and OpenBSD.
// - We do this *after* opening the log file for writing.
Expand All @@ -213,6 +199,7 @@ int main(int argc, char ** argv) {
} else {
promises = "stdio rpath tty";
}
__pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
if (pledge(0, 0)) {
LOG_TEE("warning: this OS doesn't support pledge() security\n");
} else if (pledge(promises, 0)) {
Expand All @@ -221,6 +208,20 @@ int main(int argc, char ** argv) {
}
}

if (!params.mmproj.empty() &&
(!params.image.empty() ||
params.prompt.find("<img src=\"") != std::string::npos)) {
return llava_cli(argc, argv, &params);
}

// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));

// save choice to use color for later
// (note for later: this is a slightly awkward choice)
console::init(!params.interactive || params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });

if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
Expand Down

0 comments on commit 622924c

Please sign in to comment.