diff --git a/src/Makefile b/src/Makefile
index 47b961862a..acdae9d400 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -131,6 +131,9 @@ ifdef REDIS_LDFLAGS
 endif
 
 FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
+ifeq ($(SERVER_TEST),yes)
+	FINAL_CFLAGS +=-DSERVER_TEST=1
+endif
 FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
 FINAL_LIBS=-lm
 DEBUG=-g -ggdb
@@ -383,11 +386,11 @@ endif
 ENGINE_NAME=valkey
 SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
 ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
-ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
+ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
 ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
-ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
+ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
 ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
-ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
+ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
 ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
 ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
 ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))
diff --git a/src/crc64.c b/src/crc64.c
index 0f71eea780..9d4e98ee70 100644
--- a/src/crc64.c
+++ b/src/crc64.c
@@ -28,6 +28,7 @@
 #include "crc64.h"
 #include "crcspeed.h"
+#include "serverassert.h"
 
 static uint64_t crc64_table[8][256] = {{0}};
 
 #define POLY UINT64_C(0xad93d23594c935a9)
@@ -67,14 +68,33 @@ static uint64_t crc64_table[8][256] = {{0}};
  * \return The reflected data.
  *****************************************************************************/
 static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
-    uint_fast64_t ret = data & 0x01;
+    /* only ever called for data_len == 64 in this codebase
+     *
+     * Borrowed from bit twiddling hacks, original in the public domain.
+     * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+     * Extended to 64 bits, and added byteswap for final 3 steps.
+     * 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C)
+     */
 
-    for (size_t i = 1; i < data_len; i++) {
-        data >>= 1;
-        ret = (ret << 1) | (data & 0x01);
-    }
-
-    return ret;
+    assert(data_len <= 64);
+    /* swap odd and even bits */
+    data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
+    /* swap consecutive pairs */
+    data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
+    /* swap nibbles ... */
+    data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
+#if defined(__GNUC__) || defined(__clang__)
+    data = __builtin_bswap64(data);
+#else
+    /* swap bytes */
+    data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
+    /* swap 2-byte long pairs */
+    data = ( data >> 16 & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16);
+    /* swap 4-byte quads */
+    data = ( data >> 32 & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32);
+#endif
+    /* adjust for non-64-bit reversals */
+    return data >> (64 - data_len);
 }
 
 /**
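A quick sanity check on the branch-free reflection above (not part of the patch; the helper names are illustrative): compare the bit-twiddling reverse against the naive loop it replaces.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Naive reference: the loop removed by the hunk above. */
static uint64_t reflect_slow(uint64_t data, size_t data_len) {
    uint64_t ret = data & 0x01;
    for (size_t i = 1; i < data_len; i++) {
        data >>= 1;
        ret = (ret << 1) | (data & 0x01);
    }
    return ret;
}

/* Branch-free reverse of all 64 bits, then shift down, same steps as the patched crc_reflect. */
static uint64_t reflect_fast(uint64_t d, size_t data_len) {
    d = ((d >> 1) & 0x5555555555555555ULL) | ((d & 0x5555555555555555ULL) << 1);
    d = ((d >> 2) & 0x3333333333333333ULL) | ((d & 0x3333333333333333ULL) << 2);
    d = ((d >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((d & 0x0F0F0F0F0F0F0F0FULL) << 4);
    d = ((d >> 8) & 0x00FF00FF00FF00FFULL) | ((d & 0x00FF00FF00FF00FFULL) << 8);
    d = ((d >> 16) & 0x0000FFFF0000FFFFULL) | ((d & 0x0000FFFF0000FFFFULL) << 16);
    d = (d >> 32) | (d << 32);
    return d >> (64 - data_len);
}

int main(void) {
    uint64_t samples[] = {0, 1, 0x8000000000000000ULL, 0x123456789abcdef0ULL};
    for (size_t i = 0; i < sizeof(samples)/sizeof(samples[0]); i++)
        assert(reflect_slow(samples[i], 64) == reflect_fast(samples[i], 64));
    puts("reflect ok");
    return 0;
}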
@@ -126,29 +146,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
 
 #ifdef SERVER_TEST
 #include <stdio.h>
 
+static void genBenchmarkRandomData(char *data, int count);
+static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
+static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
+long long _ustime(void);
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "zmalloc.h"
+#include "crccombine.h"
+
+long long _ustime(void) {
+    struct timeval tv;
+    long long ust;
+
+    gettimeofday(&tv, NULL);
+    ust = ((long long)tv.tv_sec)*1000000;
+    ust += tv.tv_usec;
+    return ust;
+}
+
+static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
+    uint64_t min = size, hash;
+    long long original_start = _ustime(), original_end;
+    for (long long i=passes; i > 0; i--) {
+        hash = crc64(0, data, size);
+    }
+    original_end = _ustime();
+    min = (original_end - original_start) * 1000 / passes;
+    /* approximate nanoseconds without nstime */
+    if (csv) {
+        printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
+               name, size, (1000 * size) / min, hash == check);
+    } else {
+        printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
+               size, name, (1000 * size) / min, hash == check);
+    }
+    return hash != check;
+}
+
+const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);
+
+static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
+    uint64_t min = size, start = expect, thash = expect ^ (expect >> 17);
+    long long original_start = _ustime(), original_end;
+    for (int i=0; i < 1000; i++) {
+        crc64_combine(thash, start, size, BENCH_RPOLY, 64);
+    }
+    original_end = _ustime();
+    /* ran 1000 times, want ns per, counted us per 1000 ... */
+    min = original_end - original_start;
+    if (csv) {
+        printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min);
+    } else {
+        printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min);
+    }
+}
+
+static void genBenchmarkRandomData(char *data, int count) {
+    static uint32_t state = 1234;
+    int i = 0;
+
+    while (count--) {
+        state = (state*1103515245+12345);
+        data[i++] = '0'+((state>>16)&63);
+    }
+}
+
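To make the units in bench_crc64() above concrete: the timestamps are microseconds, `min` ends up as nanoseconds per pass, and the reported figure is roughly megabytes per second (bytes per microsecond). A small worked example with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* hypothetical run: a 1 MiB buffer hashed 100 times in 50,000 us */
    uint64_t size = 1048576, passes = 100, elapsed_us = 50000;
    uint64_t ns_per_pass = elapsed_us * 1000 / passes;   /* "min" in bench_crc64 */
    uint64_t mb_per_sec  = (1000 * size) / ns_per_pass;  /* bytes per microsecond ~= MB/s */
    printf("%llu ns/pass, ~%llu M/sec\n",
           (unsigned long long)ns_per_pass, (unsigned long long)mb_per_sec);
    /* prints: 500000 ns/pass, ~2097 M/sec */
    return 0;
}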
 #define UNUSED(x) (void)(x)
 int crc64Test(int argc, char *argv[], int flags) {
-    UNUSED(argc);
-    UNUSED(argv);
     UNUSED(flags);
-    crc64_init();
-    printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
-           (uint64_t)_crc64(0, "123456789", 9));
-    printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
-           (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
-    char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
-                "do eiusmod tempor incididunt ut labore et dolore magna "
-                "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
-                "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
-                "aute irure dolor in reprehenderit in voluptate velit esse "
-                "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
-                "occaecat cupidatat non proident, sunt in culpa qui officia "
-                "deserunt mollit anim id est laborum.";
-    printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
-           (uint64_t)_crc64(0, li, sizeof(li)));
-    printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
-           (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
+
+    uint64_t crc64_test_size = 0;
+    int i, lastarg, csv = 0, loop = 0, combine = 0;
+again:
+    for (i = 3; i < argc; i++) {
+        lastarg = (i == (argc-1));
+        if (!strcmp(argv[i],"--help")) {
+            goto usage;
+        } else if (!strcmp(argv[i],"--csv")) {
+            csv = 1;
+        } else if (!strcmp(argv[i],"-l")) {
+            loop = 1;
+        } else if (!strcmp(argv[i],"--crc")) {
+            if (lastarg) goto invalid;
+            crc64_test_size = atoll(argv[++i]);
+        } else if (!strcmp(argv[i],"--combine")) {
+            combine = 1;
+        } else {
+invalid:
+            printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
+usage:
+            printf(
+"Usage: crc64 [OPTIONS]\n\n"
+" --csv              Output in CSV format\n"
+" -l                 Loop. Run the tests forever\n"
+" --crc <size>       Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
+" --combine          Benchmark crc64 combine value ranges and timings.\n"
+            );
+            return 1;
+        }
+    }
+
+    if (crc64_test_size == 0 && combine == 0) {
+        crc64_init();
+        printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
+               (uint64_t)_crc64(0, "123456789", 9));
+        printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
+               (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
+        char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
+                    "do eiusmod tempor incididunt ut labore et dolore magna "
+                    "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
+                    "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
+                    "aute irure dolor in reprehenderit in voluptate velit esse "
+                    "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
+                    "occaecat cupidatat non proident, sunt in culpa qui officia "
+                    "deserunt mollit anim id est laborum.";
+        printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
+               (uint64_t)_crc64(0, li, sizeof(li)));
+        printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
+               (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
+        return 0;
+
+    }
+
+    int init_this_loop = 1;
+    long long init_start, init_end;
+
+    do {
+        unsigned char* data = NULL;
+        uint64_t passes = 0;
+        if (crc64_test_size) {
+            data = zmalloc(crc64_test_size);
+            genBenchmarkRandomData((char*)data, crc64_test_size);
+            /* We want to hash about 1 gig of data in total, looped, to get a good
+             * idea of our performance.
+             */
+            passes = (UINT64_C(0x100000000) / crc64_test_size);
+            passes = passes >= 2 ? passes : 2;
+            passes = passes <= 1000 ? passes : 1000;
+        }
+
+        crc64_init();
+        /* warm up the cache */
+        set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
+        uint64_t expect = crc64(0, data, crc64_test_size);
+
+        if (!combine && crc64_test_size) {
+            if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");
+
+            /* get the single-character version for single-byte Redis behavior */
+            set_crc64_cutoffs(0, crc64_test_size+1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)) return 1;
+
+            set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
+            /* run with 8-byte "single" path, crcfaster */
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv)) return 1;
+
+            /* run with dual 8-byte paths */
+            set_crc64_cutoffs(1, crc64_test_size+1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv)) return 1;
+
+            /* run with tri 8-byte paths */
+            set_crc64_cutoffs(1, 1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv)) return 1;
+
+            /* Be free memory region, be free. */
+            zfree(data);
+            data = NULL;
+        }
+
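The four benchmark variants above differ only in how the cutoffs are set before calling crc64(); the same knob can force a particular code path outside the test harness. A minimal sketch — the thresholds shown are examples rather than tuned recommendations, and it assumes crc64.h/crcspeed.h are on the include path as in this tree:

#include <stddef.h>
#include <stdint.h>
#include "crc64.h"     /* crc64_init(), crc64() */
#include "crcspeed.h"  /* set_crc64_cutoffs(), added by this patch */

uint64_t checksum_payload(const unsigned char *buf, size_t len) {
    /* len > dual_cutoff selects the 2-way path, len > tri_cutoff the 3-way path.
     * crc64_init() must have been called once beforehand; note the cutoffs are
     * process-wide globals, so changing them is not thread-safe. */
    set_crc64_cutoffs(128, 2 * 1024);
    return crc64(0, buf, len);
}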
+        uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
+        if (combine) {
+            if (init_this_loop) {
+                init_start = _ustime();
+                crc64_combine(
+                    UINT64_C(0xdeadbeefdeadbeef),
+                    UINT64_C(0xfeebdaedfeebdaed),
+                    INIT_SIZE,
+                    BENCH_RPOLY, 64);
+                init_end = _ustime();
+
+                init_end -= init_start;
+                init_end *= 1000;
+                if (csv) {
+                    printf("operation,size,nanoseconds\n");
+                    printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
+                } else {
+                    printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
+                }
+                /* use the hash itself as the size (unpredictable) */
+                bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);
+
+                /* let's do something big (predictable, so fast) */
+                bench_combine("largest_combine", INIT_SIZE, expect, csv);
+            }
+            bench_combine("combine", crc64_test_size, expect, csv);
+        }
+        init_this_loop = 0;
+        /* step down by ~1.641 for a range of test sizes */
+        crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
+    } while (crc64_test_size > 3);
+    if (loop) goto again;
     return 0;
 }
+# endif
+
+
+#ifdef SERVER_TEST_MAIN
+int main(int argc, char *argv[]) {
+    return crc64Test(argc, argv, 0);
+}
 #endif
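How this harness gets invoked is outside the patch; the option parser above simply skips argv[0..2] and starts at index 3. A hypothetical standalone driver (the program name and argument values are assumptions for illustration only):

int crc64Test(int argc, char *argv[], int flags); /* from crc64.c, SERVER_TEST builds */

int main(void) {
    /* benchmark a 1 MiB buffer in CSV mode; parsing starts at argv[3] */
    char *test_argv[] = {"valkey-server", "test", "crc64", "--crc", "1048576", "--csv"};
    return crc64Test(6, test_argv, 0);
}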
With cache, we see: + * E5-2670 ~1-2us to extend ~1 meg 64 bit hash + */ + uint64_t sum; + + sum = 0; + while (vec) { + /* reversing the case order is ~10% slower on Xeon E5-2670 */ + switch (vec & 15) { + case 15: + sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 14: + sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 13: + sum ^= *mat ^ *(mat+2) ^ *(mat+3); + break; + case 12: + sum ^= *(mat+2) ^ *(mat+3); + break; + case 11: + sum ^= *mat ^ *(mat+1) ^ *(mat+3); + break; + case 10: + sum ^= *(mat+1) ^ *(mat+3); + break; + case 9: + sum ^= *mat ^ *(mat+3); + break; + case 8: + sum ^= *(mat+3); + break; + case 7: + sum ^= *mat ^ *(mat+1) ^ *(mat+2); + break; + case 6: + sum ^= *(mat+1) ^ *(mat+2); + break; + case 5: + sum ^= *mat ^ *(mat+2); + break; + case 4: + sum ^= *(mat+2); + break; + case 3: + sum ^= *mat ^ *(mat+1); + break; + case 2: + sum ^= *(mat+1); + break; + case 1: + sum ^= *mat; + break; + default: + break; + } + vec >>= 4; + mat += 4; + } + return sum; +} + +#define CRC_MULTIPLY gf2_matrix_times_switch + +#else + +/* + Warning: here there be dragons involving vector math, and macros to save us + from repeating the same information over and over. +*/ + +uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) { + /* + * Uses xmm registers on x86, works basically everywhere fast, doing + * cycles of movqda, mov, shr, pand, and, pxor, at least on gcc 8. + * Is 9-11x faster than original. + * E5-2670 ~29us to extend ~1 meg 64 bit hash + * i3-8130U ~22us to extend ~1 meg 64 bit hash + */ + v2uq sum = {0, 0}, + *mv2 = (v2uq*)mat; + /* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */ + static v2uq masks2[4] = { + {0,0}, + {-1,0}, + {0,-1}, + {-1,-1}, + }; + + /* Almost as beautiful as gf2_matrix_times_vec, but only half as many + * bits per step, so we need 2 per chunk4 operation. Faster in my tests. */ + +#define DO_CHUNK4() \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2; \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2 + +#define DO_CHUNK16() \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4() + + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + + STATIC_ASSERT(sizeof(uint64_t) == 8); + STATIC_ASSERT(sizeof(long long unsigned int) == 8); + return sum[0] ^ sum[1]; +} + +#undef DO_CHUNK16 +#undef DO_CHUNK4 + +#define CRC_MULTIPLY gf2_matrix_times_vec2 +#endif + +static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) { + unsigned n; + + for (n = 0; n < dim; n++) + square[n] = CRC_MULTIPLY(mat, mat[n]); +} + +/* Turns out our Redis / Jones CRC cycles at this point, so we can support + * more than 64 bits of extension if we want. Trivially. */ +static uint64_t combine_cache[64][64]; + +/* Mark Adler has some amazing updates to crc.c in his crcany repository. I + * like static caches, and not worrying about finding cycles generally. We are + * okay to spend the 32k of memory here, leaving the algorithm unchanged from + * as it was a decade ago, and be happy that it costs <200 microseconds to + * init, and that subsequent calls to the combine function take under 100 + * nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and + * we are currently targeting one: Jones CRC64. 
+
+static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) {
+    unsigned n;
+
+    for (n = 0; n < dim; n++)
+        square[n] = CRC_MULTIPLY(mat, mat[n]);
+}
+
+/* Turns out our Redis / Jones CRC cycles at this point, so we can support
+ * more than 64 bits of extension if we want. Trivially. */
+static uint64_t combine_cache[64][64];
+
+/* Mark Adler has some amazing updates to crc.c in his crcany repository. I
+ * like static caches, and not worrying about finding cycles generally. We are
+ * okay to spend the 32k of memory here, leaving the algorithm unchanged from
+ * as it was a decade ago, and be happy that it costs <200 microseconds to
+ * init, and that subsequent calls to the combine function take under 100
+ * nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and
+ * we are currently targeting one: Jones CRC64.
+ */
+
+void init_combine_cache(uint64_t poly, uint8_t dim) {
+    unsigned n, cache_num = 0;
+    combine_cache[1][0] = poly;
+    int prev = 1;
+    uint64_t row = 1;
+    for (n = 1; n < dim; n++)
+    {
+        combine_cache[1][n] = row;
+        row <<= 1;
+    }
+
+    gf2_matrix_square(combine_cache[0], combine_cache[1], dim);
+    gf2_matrix_square(combine_cache[1], combine_cache[0], dim);
+
+    /* do/while to overwrite the first two layers, they are not used, but are
+     * re-generated in the last two layers for the Redis polynomial */
+    do {
+        gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim);
+        prev = -1;
+    } while (++cache_num < 64);
+}
+
+/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the
+ * first block, crc2 is the CRC-64 of the second block, and len2 is the length
+ * of the second block.
+ *
+ * If you want reflections on your CRCs; do them outside before / after.
+ * WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST
+ * ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results.
+ * You MAY bzero() the even/odd static arrays, which will induce a re-cache on
+ * next call as a work-around, but ... maybe just parameterize the cached
+ * models at that point like Mark Adler does in modern crcany/crc.c .
+ */
+
+uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) {
+    /* degenerate case */
+    if (len2 == 0)
+        return crc1;
+
+    unsigned cache_num = 0;
+    if (combine_cache[0][0] == 0) {
+        init_combine_cache(poly, dim);
+    }
+
+    /* apply len2 zeros to crc1 (first square will put the operator for one
+       zero byte, eight zero bits, in even) */
+    do
+    {
+        /* apply zeros operator for this bit of len2 */
+        if (len2 & 1)
+            crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1);
+        len2 >>= 1;
+        cache_num = (cache_num + 1) & 63;
+        /* if no more bits set, then done */
+    } while (len2 != 0);
+
+    /* return combined crc */
+    crc1 ^= crc2;
+    return crc1;
+}
+
+#undef CRC_MULTIPLY
diff --git a/src/crccombine.h b/src/crccombine.h
new file mode 100644
index 0000000000..8da7c5fe6a
--- /dev/null
+++ b/src/crccombine.h
@@ -0,0 +1,10 @@
+
+#include <stdint.h>
+
+
+/* mask types */
+typedef unsigned long long v2uq __attribute__ ((vector_size (16)));
+
+uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec);
+void init_combine_cache(uint64_t poly, uint8_t dim);
+uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim);
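What crc64_combine() buys us is the identity crc(A||B) = combine(crc(A), crc(B), len(B)), which holds directly here because this CRC-64 is used without pre/post inversion. A small self-check against the patched crc64() (a sketch; it assumes crc64.h declares crc64_init()/crc64() as in this tree):

#include <assert.h>
#include <stdint.h>
#include "crc64.h"
#include "crccombine.h"

#define REV_POLY UINT64_C(0x95ac9329ac4bc9b5) /* reversed Jones polynomial, as used above */

void check_combine(void) {
    unsigned char buf[256];
    for (int i = 0; i < 256; i++) buf[i] = (unsigned char)i;

    crc64_init();
    uint64_t whole = crc64(0, buf, sizeof(buf));            /* crc of A||B in one pass */
    uint64_t a = crc64(0, buf, 100);                        /* crc of A */
    uint64_t b = crc64(0, buf + 100, sizeof(buf) - 100);    /* crc of B, starting from 0 */
    assert(crc64_combine(a, b, sizeof(buf) - 100, REV_POLY, 64) == whole);
}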
diff --git a/src/crcspeed.c b/src/crcspeed.c
index 9682d8e0be..c7073cba2f 100644
--- a/src/crcspeed.c
+++ b/src/crcspeed.c
@@ -1,11 +1,21 @@
 /*
  * Copyright (C) 2013 Mark Adler
+ * Copyright (C) 2019-2024 Josiah Carlson
  * Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
  * Modifications by Matt Stancliff :
  *   - removed CRC64-specific behavior
  *   - added generation of lookup tables by parameters
  *   - removed inversion of CRC input/result
  *   - removed automatic initialization in favor of explicit initialization
+ * Modifications by Josiah Carlson
+ *   - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c
+ *   - added optional static cache
+ *   - Modified to use 1 thread to:
+ *     - Partition large crc blobs into 2-3 segments
+ *     - Process the 2-3 segments in parallel
+ *     - Merge the resulting crcs
+ *       -> Resulting in 10-90% performance boost for data > 1 meg
+ *   - macro-ized to reduce copy/pasta
 
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
@@ -28,6 +38,10 @@
  */
 
 #include "crcspeed.h"
+#include "crccombine.h"
+
+#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8)
+#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5)
 
 /* Fill in a CRC constants table. */
 void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
@@ -39,7 +53,7 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
         table[0][n] = crcfn(0, &v, 1);
     }
 
-    /* generate nested CRC table for future slice-by-8 lookup */
+    /* generate nested CRC table for future slice-by-8/16/24+ lookup */
     for (int n = 0; n < 256; n++) {
         crc = table[0][n];
         for (int k = 1; k < 8; k++) {
@@ -47,6 +61,10 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
             table[k][n] = crc;
         }
     }
+#if USE_STATIC_COMBINE_CACHE
+    /* initialize combine cache for CRC stapling for slice-by 16/24+ */
+    init_combine_cache(CRC64_REVERSED_POLY, 64);
+#endif
 }
 
 void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
@@ -104,45 +122,151 @@ void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
     }
 }
 
+/* Note: doing all of our crc/next modifications *before* the crc table
+ * references is an absolute speedup on all CPUs tested. So... keep these
+ * macros separate.
+ */
+
+#define DO_8_1(crc, next) \
+    crc ^= *(uint64_t *)next; \
+    next += 8
+
+#define DO_8_2(crc) \
+    crc = little_table[7][(uint8_t)crc] ^ \
+          little_table[6][(uint8_t)(crc >> 8)] ^ \
+          little_table[5][(uint8_t)(crc >> 16)] ^ \
+          little_table[4][(uint8_t)(crc >> 24)] ^ \
+          little_table[3][(uint8_t)(crc >> 32)] ^ \
+          little_table[2][(uint8_t)(crc >> 40)] ^ \
+          little_table[1][(uint8_t)(crc >> 48)] ^ \
+          little_table[0][crc >> 56]
+
+#define CRC64_SPLIT(div) \
+    olen = len; \
+    next2 = next1 + ((len / div) & CRC64_LEN_MASK); \
+    len = (next2 - next1)
+
+#define MERGE_CRC(crcn) \
+    crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64)
+
+#define MERGE_END(last, DIV) \
+    len = olen - ((next2 - next1) * DIV); \
+    next1 = last
+
+/* Variables so we can change for benchmarking; these seem to be fairly
+ * reasonable for Intel CPUs made since 2010. Please adjust as necessary if
+ * or when your CPU has more load / execute units. We've written benchmark code
+ * to help you tune your platform, see crc64Test. */
+#if defined(__i386__) || defined(__X86_64__)
+static size_t CRC64_TRI_CUTOFF = (2*1024);
+static size_t CRC64_DUAL_CUTOFF = (128);
+#else
+static size_t CRC64_TRI_CUTOFF = (16*1024);
+static size_t CRC64_DUAL_CUTOFF = (1024);
+#endif
+
+
+void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) {
+    CRC64_DUAL_CUTOFF = dual_cutoff;
+    CRC64_TRI_CUTOFF = tri_cutoff;
+}
+
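To make the split arithmetic concrete: CRC64_SPLIT(div) carves off len/div rounded down to a multiple of 8 (CRC64_LEN_MASK clears the low three bits) for each interleaved stream, and MERGE_END leaves whatever remains for the fallthrough loops. A worked example, with an arbitrary 1000-byte buffer on the 3-way path:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* illustrative numbers only */
    uint64_t len = 1000, div = 3;
    uint64_t mask = UINT64_C(0x7ffffffffffffff8); /* CRC64_LEN_MASK */
    uint64_t seg  = (len / div) & mask;           /* bytes per interleaved stream */
    uint64_t tail = len - seg * div;              /* handled by the fallthrough loops */
    printf("segment=%llu bytes x %llu streams, tail=%llu bytes\n",
           (unsigned long long)seg, (unsigned long long)div, (unsigned long long)tail);
    /* prints: segment=328 bytes x 3 streams, tail=16 bytes */
    return 0;
}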
- * 64 bit crc = process 8 bytes at once; + * 64 bit crc = process 8/16/24 bytes at once; */ -uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc, +uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1, void *buf, size_t len) { - unsigned char *next = buf; + unsigned char *next1 = buf; + + if (CRC64_DUAL_CUTOFF < 1) { + goto final; + } /* process individual bytes until we reach an 8-byte aligned pointer */ - while (len && ((uintptr_t)next & 7) != 0) { - crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + while (len && ((uintptr_t)next1 & 7) != 0) { + crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8); len--; } - /* fast middle processing, 8 bytes (aligned!) per loop */ + if (len > CRC64_TRI_CUTOFF) { + /* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */ + unsigned char *next2, *next3; + uint64_t olen, crc2=0, crc3=0; + CRC64_SPLIT(3); + /* len is now the length of the first segment, the 3rd segment possibly + * having extra bytes to clean up at the end + */ + next3 = next2 + len; + while (len >= 8) { + len -= 8; + DO_8_1(crc1, next1); + DO_8_1(crc2, next2); + DO_8_1(crc3, next3); + DO_8_2(crc1); + DO_8_2(crc2); + DO_8_2(crc3); + } + + /* merge the 3 crcs */ + MERGE_CRC(crc2); + MERGE_CRC(crc3); + MERGE_END(next3, 3); + } else if (len > CRC64_DUAL_CUTOFF) { + /* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */ + unsigned char *next2; + uint64_t olen, crc2=0; + CRC64_SPLIT(2); + /* len is now the length of the first segment, the 2nd segment possibly + * having extra bytes to clean up at the end + */ + while (len >= 8) { + len -= 8; + DO_8_1(crc1, next1); + DO_8_1(crc2, next2); + DO_8_2(crc1); + DO_8_2(crc2); + } + + /* merge the 2 crcs */ + MERGE_CRC(crc2); + MERGE_END(next2, 2); + } + /* We fall through here to handle our = 8) { - crc ^= *(uint64_t *)next; - crc = little_table[7][crc & 0xff] ^ - little_table[6][(crc >> 8) & 0xff] ^ - little_table[5][(crc >> 16) & 0xff] ^ - little_table[4][(crc >> 24) & 0xff] ^ - little_table[3][(crc >> 32) & 0xff] ^ - little_table[2][(crc >> 40) & 0xff] ^ - little_table[1][(crc >> 48) & 0xff] ^ - little_table[0][crc >> 56]; - next += 8; len -= 8; + DO_8_1(crc1, next1); + DO_8_2(crc1); } - +final: /* process remaining bytes (can't be larger than 8) */ while (len) { - crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8); len--; } - return crc; + return crc1; } +/* clean up our namespace */ +#undef DO_8_1 +#undef DO_8_2 +#undef CRC64_SPLIT +#undef MERGE_CRC +#undef MERGE_END +#undef CRC64_REVERSED_POLY +#undef CRC64_LEN_MASK + + +/* note: similar perf advantages can be had for long strings in crc16 using all + * of the same optimizations as above; though this is unnecessary. crc16 is + * normally used to shard keys; not hash / verify data, so is used on shorter + * data that doesn't warrant such changes. */ + uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc, void *buf, size_t len) { unsigned char *next = buf; @@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf, len--; } + /* note: alignment + 2/3-way processing can probably be handled here nearly + the same as above, using our updated DO_8_2 macro. Not included in these + changes, as other authors, I don't have big-endian to test with. 
@@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
         len--;
     }
 
+    /* note: alignment + 2/3-way processing can probably be handled here nearly
+       the same as above, using our updated DO_8_2 macro. Not included in these
+       changes because, like the other authors, I don't have big-endian hardware to test with. */
+
     while (len >= 8) {
         crc ^= *(uint64_t *)next;
         crc = big_table[0][crc & 0xff] ^
diff --git a/src/crcspeed.h b/src/crcspeed.h
index d7ee95ebb5..c29f236bc0 100644
--- a/src/crcspeed.h
+++ b/src/crcspeed.h
@@ -34,6 +34,8 @@
 typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
 typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
 
+void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff);
+
 /* CRC-64 */
 void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
 void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);