Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CRC64 perf improvements from Redis patches #350

Merged
merged 10 commits into from May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/Makefile
Expand Up @@ -131,6 +131,9 @@ ifdef REDIS_LDFLAGS
endif

FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
ifeq ($(SERVER_TEST),yes)
FINAL_CFLAGS +=-DSERVER_TEST=1
endif
FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
FINAL_LIBS=-lm
DEBUG=-g -ggdb
Expand Down Expand Up @@ -383,11 +386,11 @@ endif
ENGINE_NAME=valkey
SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))
Expand Down
264 changes: 238 additions & 26 deletions src/crc64.c
Expand Up @@ -28,6 +28,7 @@

#include "crc64.h"
#include "crcspeed.h"
#include "serverassert.h"
zuiderkwast marked this conversation as resolved.
Show resolved Hide resolved
static uint64_t crc64_table[8][256] = {{0}};

#define POLY UINT64_C(0xad93d23594c935a9)
Expand Down Expand Up @@ -67,14 +68,33 @@ static uint64_t crc64_table[8][256] = {{0}};
* \return The reflected data.
*****************************************************************************/
static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
uint_fast64_t ret = data & 0x01;
/* only ever called for data_len == 64 in this codebase
josiahcarlson marked this conversation as resolved.
Show resolved Hide resolved
*
* Borrowed from bit twiddling hacks, original in the public domain.
* https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
* Extended to 64 bits, and added byteswap for final 3 steps.
* 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C)
*/

for (size_t i = 1; i < data_len; i++) {
data >>= 1;
ret = (ret << 1) | (data & 0x01);
}

return ret;
assert(data_len <= 64);
/* swap odd and even bits */
data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
/* swap consecutive pairs */
data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
/* swap nibbles ... */
data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
#if defined(__GNUC__) || defined(__clang__)
data = __builtin_bswap64(data);
#else
/* swap bytes */
data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
/* swap 2-byte long pairs */
data = ( data >> 16 & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16);
/* swap 4-byte quads */
data = ( data >> 32 & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32);
#endif
/* adjust for non-64-bit reversals */
return data >> (64 - data_len);
}

/**
Expand Down Expand Up @@ -126,29 +146,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
#ifdef SERVER_TEST
#include <stdio.h>

static void genBenchmarkRandomData(char *data, int count);
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
long long _ustime(void);

#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

#include "zmalloc.h"
#include "crccombine.h"

long long _ustime(void) {
struct timeval tv;
long long ust;

gettimeofday(&tv, NULL);
ust = ((long long)tv.tv_sec)*1000000;
ust += tv.tv_usec;
return ust;
}

static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
uint64_t min = size, hash;
long long original_start = _ustime(), original_end;
for (long long i=passes; i > 0; i--) {
hash = crc64(0, data, size);
}
original_end = _ustime();
min = (original_end - original_start) * 1000 / passes;
/* approximate nanoseconds without nstime */
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
name, size, (1000 * size) / min, hash == check);
} else {
printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
size, name, (1000 * size) / min, hash == check);
}
return hash != check;
}

const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);

static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
uint64_t min = size, start = expect, thash = expect ^ (expect >> 17);
long long original_start = _ustime(), original_end;
for (int i=0; i < 1000; i++) {
crc64_combine(thash, start, size, BENCH_RPOLY, 64);
}
original_end = _ustime();
/* ran 1000 times, want ns per, counted us per 1000 ... */
min = original_end - original_start;
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min);
} else {
printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min);
}
}

static void genBenchmarkRandomData(char *data, int count) {
static uint32_t state = 1234;
int i = 0;

while (count--) {
state = (state*1103515245+12345);
data[i++] = '0'+((state>>16)&63);
}
}

#define UNUSED(x) (void)(x)
int crc64Test(int argc, char *argv[], int flags) {
UNUSED(argc);
UNUSED(argv);
UNUSED(flags);
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));

uint64_t crc64_test_size = 0;
int i, lastarg, csv = 0, loop = 0, combine = 0;
again:
for (i = 3; i < argc; i++) {
lastarg = (i == (argc-1));
if (!strcmp(argv[i],"--help")) {
goto usage;
} else if (!strcmp(argv[i],"--csv")) {
csv = 1;
} else if (!strcmp(argv[i],"-l")) {
loop = 1;
} else if (!strcmp(argv[i],"--crc")) {
if (lastarg) goto invalid;
crc64_test_size = atoll(argv[++i]);
} else if (!strcmp(argv[i],"--combine")) {
combine = 1;
} else {
invalid:
printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
usage:
printf(
"Usage: crc64 [OPTIONS]\n\n"
" --csv Output in CSV format\n"
" -l Loop. Run the tests forever\n"
" --crc <bytes> Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
" --combine Benchmark crc64 combine value ranges and timings.\n"
);
return 1;
}
}

if (crc64_test_size == 0 && combine == 0) {
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
return 0;

}

int init_this_loop = 1;
long long init_start, init_end;

do {
unsigned char* data = NULL;
uint64_t passes = 0;
if (crc64_test_size) {
data = zmalloc(crc64_test_size);
genBenchmarkRandomData((char*)data, crc64_test_size);
/* We want to hash about 1 gig of data in total, looped, to get a good
* idea of our performance.
*/
passes = (UINT64_C(0x100000000) / crc64_test_size);
passes = passes >= 2 ? passes : 2;
passes = passes <= 1000 ? passes : 1000;
}

crc64_init();
/* warm up the cache */
set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
uint64_t expect = crc64(0, data, crc64_test_size);

if (!combine && crc64_test_size) {
if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");

/* get the single-character version for single-byte Redis behavior */
set_crc64_cutoffs(0, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)) return 1;

set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
/* run with 8-byte "single" path, crcfaster */
if (bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv)) return 1;

/* run with dual 8-byte paths */
set_crc64_cutoffs(1, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv)) return 1;

/* run with tri 8-byte paths */
set_crc64_cutoffs(1, 1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv)) return 1;

/* Be free memory region, be free. */
zfree(data);
data = NULL;
}

uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
if (combine) {
if (init_this_loop) {
init_start = _ustime();
crc64_combine(
UINT64_C(0xdeadbeefdeadbeef),
UINT64_C(0xfeebdaedfeebdaed),
INIT_SIZE,
BENCH_RPOLY, 64);
init_end = _ustime();

init_end -= init_start;
init_end *= 1000;
if (csv) {
printf("operation,size,nanoseconds\n");
printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
} else {
printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
}
/* use the hash itself as the size (unpredictable) */
bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);

/* let's do something big (predictable, so fast) */
bench_combine("largest_combine", INIT_SIZE, expect, csv);
}
bench_combine("combine", crc64_test_size, expect, csv);
}
init_this_loop = 0;
/* step down by ~1.641 for a range of test sizes */
crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
} while (crc64_test_size > 3);
if (loop) goto again;
return 0;
}
# endif


#ifdef SERVER_TEST_MAIN
int main(int argc, char *argv[]) {
return crc64Test(argc, argv);
}

#endif