Skip to content

Commit

Permalink
CRC64 perf improvements from Redis patches
Browse files Browse the repository at this point in the history
* 53-73% faster on Xeon 2670 v0 @ 2.6ghz
* 2-2.5x faster on Core i3 8130U @ 2.2 ghz
* 1.6-2.46 bytes/cycle on i3 8130U
* likely >2x faster than crcspeed on newer CPUs with more resources than a 2012-era Xeon 2670
* crc64 combine function runs in <50 nanoseconds typical with vector + cache optimizations
  (~8 *microseconds* without vector optimizations, ~80 *microseconds* without cache,
  the combination is extra effective)
* still single-threaded
* valkey-server test crc64 --help (requires `make distclean && make SERVER_TEST=yes`)
  • Loading branch information
josiahcarlson committed Apr 22, 2024
1 parent a989ee5 commit ec8fdf2
Show file tree
Hide file tree
Showing 7 changed files with 573 additions and 49 deletions.
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -54,6 +54,10 @@ installed):
% ./utils/gen-test-certs.sh
% ./runtest --tls

To build and test the CRC64 performance improvements:

% make distclean && make SERVER_TEST=yes
% src/valkey-server test crc64 --crc 10000000

Fixing build problems with dependencies or cached build options
---------
Expand Down
9 changes: 6 additions & 3 deletions src/Makefile
Expand Up @@ -36,6 +36,9 @@ NODEPS:=clean distclean

# Default settings
STD=-pedantic -DSERVER_STATIC=''
ifeq ($(SERVER_TEST),yes)
STD +=-DSERVER_TEST=1
endif

# Use -Wno-c11-extensions on clang, either where explicitly used or on
# platforms we can assume it's being used.
Expand Down Expand Up @@ -383,11 +386,11 @@ endif
ENGINE_NAME=valkey
SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))
Expand Down
262 changes: 236 additions & 26 deletions src/crc64.c
Expand Up @@ -67,14 +67,32 @@ static uint64_t crc64_table[8][256] = {{0}};
* \return The reflected data.
*****************************************************************************/
static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
    /* Reverse the order of the low data_len bits of data.
     *
     * Only ever called for data_len == 64 in this codebase.
     *
     * Borrowed from bit twiddling hacks, original in the public domain.
     * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
     * Extended to 64 bits, and added byteswap for final 3 steps.
     * 16-30x 64-bit operations, no comparisons (16 for native byteswap,
     * 30 for pure C).
     *
     * The whole 64-bit word is reversed first and then moved into place, so
     * the result equals what the earlier one-bit-per-iteration loop produced
     * for every data_len, including the out-of-range ones handled at the end.
     */
    uint64_t bits = (uint64_t)data;

    /* swap odd and even bits */
    bits = ((bits >> 1) & 0x5555555555555555ULL) | ((bits & 0x5555555555555555ULL) << 1);
    /* swap consecutive pairs */
    bits = ((bits >> 2) & 0x3333333333333333ULL) | ((bits & 0x3333333333333333ULL) << 2);
    /* swap nibbles ... */
    bits = ((bits >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((bits & 0x0F0F0F0F0F0F0F0FULL) << 4);
#if defined(__GNUC__) || defined(__clang__)
    /* the remaining three steps are exactly a byte swap */
    bits = __builtin_bswap64(bits);
#else
    /* swap bytes */
    bits = ((bits >> 8) & 0x00FF00FF00FF00FFULL) | ((bits & 0x00FF00FF00FF00FFULL) << 8);
    /* swap 2-byte long pairs */
    bits = ((bits >> 16) & 0x0000FFFF0000FFFFULL) | ((bits & 0x0000FFFF0000FFFFULL) << 16);
    /* swap 4-byte quads */
    bits = ((bits >> 32) & 0x00000000FFFFFFFFULL) | ((bits & 0x00000000FFFFFFFFULL) << 32);
#endif

    /* The bit-serial version always emitted at least one bit; keep that, and
     * avoid an undefined shift by 64. */
    if (data_len == 0) data_len = 1;

    /* adjust for non-64-bit reversals */
    if (data_len <= 64) return bits >> (64 - data_len);

    /* Fields wider than 64 bits: the reflected value slides upwards and the
     * bits that leave the 64-bit word are dropped. */
    data_len -= 64;
    return data_len < 64 ? bits << data_len : 0;
}

/**
Expand Down Expand Up @@ -126,29 +144,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
#ifdef SERVER_TEST
#include <stdio.h>

/* Forward declarations for the benchmark helpers defined further down in
 * this SERVER_TEST section. */
static void genBenchmarkRandomData(char *data, int count);
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
/* Current time of day in microseconds; used to time the benchmarks. */
long long _ustime(void);

#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

#include "zmalloc.h"
#include "crccombine.h"

/* Return the current time of day, as reported by gettimeofday(), expressed
 * as a single count of microseconds. */
long long _ustime(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    /* widen the seconds before scaling so the product is computed in long long */
    return (long long)now.tv_sec * 1000000 + now.tv_usec;
}

/* Time `passes` runs of crc64() over `data` and print the throughput.
 *
 * data   - buffer to checksum
 * size   - number of bytes in data
 * passes - how many times to checksum the buffer (values below 1 run once)
 * check  - the CRC the buffer is expected to produce
 * name   - label printed for the configuration being measured
 * csv    - non-zero selects comma separated output
 *
 * Returns 0 when the computed CRC equals `check`, non-zero otherwise.
 */
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
    uint64_t hash = 0;

    /* Guarantee at least one pass so `hash` is always computed and the
     * per-pass division below cannot divide by zero. */
    if (passes < 1) passes = 1;

    long long started = _ustime();
    for (long long remaining = passes; remaining > 0; remaining--) {
        hash = crc64(0, data, size);
    }
    long long elapsed_us = _ustime() - started;

    /* approximate nanoseconds without nstime */
    long long ns_per_pass = elapsed_us * 1000 / passes;
    /* Small buffers can finish below the clock resolution (and the wall clock
     * may even step backwards); clamp so the rate division stays defined. */
    if (ns_per_pass < 1) ns_per_pass = 1;

    uint64_t rate = (1000 * size) / (uint64_t)ns_per_pass;
    int matches = (hash == check);

    if (csv) {
        printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
               name, size, rate, matches);
    } else {
        printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
               size, name, rate, matches);
    }
    return !matches;
}

/* Polynomial argument handed to crc64_combine() by the combine benchmarks.
 * NOTE(review): presumably the bit-reflected form of the CRC-64 polynomial
 * used by crc64() -- confirm against crccombine. */
const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);

/* Time 1000 crc64_combine() calls for one length and print the result.
 *
 * label  - name printed for this measurement
 * size   - length argument passed to crc64_combine()
 * expect - CRC value from which both CRC inputs are derived
 * csv    - non-zero selects comma separated output
 */
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
    uint64_t second_crc = expect;
    uint64_t first_crc = expect ^ (expect >> 17);
    long long began = _ustime();
    long long finished;
    int round = 0;

    while (round < 1000) {
        crc64_combine(first_crc, second_crc, size, BENCH_RPOLY, 64);
        round++;
    }
    finished = _ustime();

    /* ran 1000 times, want ns per, counted us per 1000 ... so the total
     * microsecond count already equals nanoseconds per call */
    uint64_t nsec = finished - began;

    if (!csv) {
        printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, nsec);
        return;
    }
    printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, nsec);
}

/* Fill `data` with `count` pseudo-random printable bytes.
 *
 * Each byte is '0' plus a 6-bit value taken from a linear congruential
 * generator. The generator state is static, so successive calls continue
 * the same sequence.
 *
 * A NULL buffer or a non-positive count writes nothing. Callers hand in
 * 64-bit sizes that can truncate to a negative int; counting such a value
 * down would run past the buffer until the counter overflowed.
 */
static void genBenchmarkRandomData(char *data, int count) {
    static uint32_t state = 1234;

    if (data == NULL || count <= 0) return;

    for (int i = 0; i < count; i++) {
        state = state * 1103515245 + 12345;
        data[i] = '0' + ((state >> 16) & 63);
    }
}

#define UNUSED(x) (void)(x)

/* Self-test and benchmark driver for crc64.
 *
 * Options are read from argv[3] onward:
 *   --csv          print results as CSV
 *   -l             repeat the benchmarks forever
 *   --crc <bytes>  benchmark the crc64 code paths, starting with a buffer of
 *                  this size and stepping down to smaller ones
 *   --combine      benchmark crc64_combine()
 *   --help         print usage
 *
 * With neither --crc nor --combine the known-answer checks are printed once.
 *
 * flags is unused.
 *
 * Returns 0 on success, 1 on a usage error or when a benchmarked code path
 * produces a CRC different from the reference value.
 */
int crc64Test(int argc, char *argv[], int flags) {
    UNUSED(flags);

    uint64_t crc64_test_size = 0;
    int i, lastarg, csv = 0, loop = 0, combine = 0;
again:
    for (i = 3; i < argc; i++) {
        lastarg = (i == (argc-1));
        if (!strcmp(argv[i],"--help")) {
            goto usage;
        } else if (!strcmp(argv[i],"--csv")) {
            csv = 1;
        } else if (!strcmp(argv[i],"-l")) {
            loop = 1;
        } else if (!strcmp(argv[i],"--crc")) {
            if (lastarg) goto invalid;
            long long requested = atoll(argv[++i]);
            /* a negative size would turn into an enormous unsigned buffer size */
            if (requested < 0) goto invalid;
            crc64_test_size = requested;
        } else if (!strcmp(argv[i],"--combine")) {
            combine = 1;
        } else {
invalid:
            printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
usage:
            printf(
"Usage: crc64 [OPTIONS]\n\n"
" --csv Output in CSV format\n"
" -l Loop. Run the tests forever\n"
" --crc <bytes> Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
" --combine Benchmark crc64 combine value ranges and timings.\n"
            );
            return 1;
        }
    }

    if (crc64_test_size == 0 && combine == 0) {
        /* No benchmark requested: print the known-answer checks for the
         * reference (_crc64) and accelerated (crc64) implementations. */
        crc64_init();
        printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
               (uint64_t)_crc64(0, "123456789", 9));
        printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
               (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
        char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
                    "do eiusmod tempor incididunt ut labore et dolore magna "
                    "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
                    "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
                    "aute irure dolor in reprehenderit in voluptate velit esse "
                    "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
                    "occaecat cupidatat non proident, sunt in culpa qui officia "
                    "deserunt mollit anim id est laborum.";
        /* sizeof(li) deliberately includes the terminating NUL */
        printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
               (uint64_t)_crc64(0, li, sizeof(li)));
        printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
               (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
        return 0;
    }

    int init_this_loop = 1;
    long long init_start, init_end;

    do {
        unsigned char *data = NULL;
        uint64_t passes = 0;
        if (crc64_test_size) {
            data = zmalloc(crc64_test_size);
            genBenchmarkRandomData((char*)data, crc64_test_size);
            /* We want to hash about 1 gig of data in total, looped, to get a good
             * idea of our performance.
             */
            passes = (UINT64_C(0x100000000) / crc64_test_size);
            passes = passes >= 2 ? passes : 2;
            passes = passes <= 1000 ? passes : 1000;
        }

        crc64_init();
        /* warm up the cache */
        set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
        uint64_t expect = crc64(0, data, crc64_test_size);

        if (!combine && crc64_test_size) {
            if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");

            /* Cutoff pairs that select each code path, in the order they are
             * benchmarked. */
            const struct {
                uint64_t first_cutoff;
                uint64_t second_cutoff;
                char *name;
            } variants[] = {
                /* get the single-character version for single-byte Redis behavior */
                {0, crc64_test_size+1, "crc_1byte"},
                /* run with 8-byte "single" path, crcfaster */
                {crc64_test_size+1, crc64_test_size+1, "crcspeed"},
                /* run with dual 8-byte paths */
                {1, crc64_test_size+1, "crcdual"},
                /* run with tri 8-byte paths */
                {1, 1, "crctri"},
            };

            for (size_t v = 0; v < sizeof(variants) / sizeof(variants[0]); v++) {
                set_crc64_cutoffs(variants[v].first_cutoff, variants[v].second_cutoff);
                if (bench_crc64(data, crc64_test_size, passes, expect, variants[v].name, csv)) {
                    /* release the buffer before reporting the mismatch */
                    zfree(data);
                    return 1;
                }
            }
        }

        uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
        if (combine) {
            if (init_this_loop) {
                /* The first combine call is timed on its own and reported as
                 * "init_64". */
                init_start = _ustime();
                crc64_combine(
                    UINT64_C(0xdeadbeefdeadbeef),
                    UINT64_C(0xfeebdaedfeebdaed),
                    INIT_SIZE,
                    BENCH_RPOLY, 64);
                init_end = _ustime();

                init_end -= init_start;
                init_end *= 1000;
                if (csv) {
                    printf("operation,size,nanoseconds\n");
                    printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
                } else {
                    printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
                }
                /* use the hash itself as the size (unpredictable)
                 * NOTE(review): the size actually passed is crc64_test_size,
                 * not the hash -- confirm which one is intended. */
                bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);

                /* let's do something big (predictable, so fast) */
                bench_combine("largest_combine", INIT_SIZE, expect, csv);
            }
            bench_combine("combine", crc64_test_size, expect, csv);
        }

        /* Be free memory region, be free. Done once per iteration so the
         * buffer is released in --combine mode as well. */
        if (data != NULL) zfree(data);
        data = NULL;

        init_this_loop = 0;
        /* step down by ~1.641 for a range of test sizes */
        crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
    } while (crc64_test_size > 3);
    if (loop) goto again;
    return 0;
}
# endif


#ifdef SERVER_TEST_MAIN
/* Standalone entry point, built only when SERVER_TEST_MAIN is defined.
 * crc64Test() takes a third `flags` argument, which it ignores, so 0 is
 * passed. Note that crc64Test() reads its options from argv[3] onward. */
int main(int argc, char *argv[]) {
    return crc64Test(argc, argv, 0);
}

#endif

0 comments on commit ec8fdf2

Please sign in to comment.