CrapWow64

64-bit variant of CrapWow. The implementation is actually still faster than Murmur2 without inline asm in gcc, but I could not find out how to do 64-bit x 64-bit = 128-bit math in icc, so for that compiler it had to be written as inline asm.

Tests

Performance

Implementation

/*
 * CrapWow64 - hash `len` bytes starting at `key` into a 64-bit value.
 *
 * Two 64-bit accumulators are kept: h (starts as len) and k (starts as
 * len + seed + n).  The input is consumed as 8-byte words; every word is
 * multiplied by one of the constants m / n into a 128-bit product whose
 * low and high halves are XORed into the accumulators:
 *
 *     mixb(x):  p = x * n;  h ^= lo(p);  k ^= hi(p);
 *     mixa(x):  p = x * m;  k ^= lo(p);  h ^= hi(p);
 *
 * Each 16-byte block gets mixb(word0) then mixa(word1); a remaining 8-byte
 * word gets mixb; the last 1..7 bytes are packed into one word (first byte
 * in the lowest bits) and get mixa.  After a final mixb( h ^ (k + n) ) the
 * result is k ^ h.
 *
 * Parameters:
 *   key  - input bytes; only the `len` bytes of the buffer are read.
 *   len  - number of input bytes.
 *   seed - value folded into the initial state of k.
 * Returns the 64-bit hash.
 *
 * Two implementations are selected by the preprocessor: x86-64 inline asm
 * (GCC / ICC) and a C version that needs a 128-bit integer type (u128).
 *
 * NOTE(review): whole words are still loaded through a u64 pointer in the
 * C path, in the host's byte order and without alignment handling -- this
 * presumably targets x86 only; confirm before using it elsewhere.
 */
finline u64 fastcall CrapWow64( const u8 *key, u64 len, u64 seed ) {
	const u64 m = 0x95b47aa3355ba1a1, n = 0x8a970be7488fda55;
#if ( defined(INTEL) || defined(GCC) ) && !defined(MSVC) && defined(X64)
	u64 hash;
	// Operands:  %3 = m, %4 = n
	// On entry:  rax = seed, rcx = len, rdx = key
	// Inside:    r12 = h, r13 = k, r14 = read pointer, r15 = saved len
	asm(
		// k = len + n + seed, h = len
		"leaq (%%rcx,%4), %%r13\n"
		"movq %%rdx, %%r14\n"
		"movq %%rcx, %%r15\n"
		"movq %%rcx, %%r12\n"
		"addq %%rax, %%r13\n"
		// rcx = byte count of the whole 16-byte blocks; skip the loop if none
		"andq $0xfffffffffffffff0, %%rcx\n"
		"jz QW%=\n"
		// point r14 past the blocks and count rcx up from -count to 0
		"addq %%rcx, %%r14\n"
		"negq %%rcx\n"
	"XW%=:\n"
		// mixb( word0 )
		"movq %4, %%rax\n"
		"mulq (%%r14,%%rcx)\n"
		"xorq %%rax, %%r12\n"
		"xorq %%rdx, %%r13\n"
		// mixa( word1 )
		"movq %3, %%rax\n"
		"mulq 8(%%r14,%%rcx)\n"
		"xorq %%rdx, %%r12\n"
		"xorq %%rax, %%r13\n"
		"addq $16, %%rcx\n"
		"jnz XW%=\n"
	"QW%=:\n"
		// one more full 8-byte word left?  ->  mixb( word )
		"movq %%r15, %%rcx\n"
		"andq $8, %%r15\n"
		"jz B%=\n"
		"movq %4, %%rax\n"
		"mulq (%%r14)\n"
		"addq $8, %%r14\n"
		"xorq %%rax, %%r12\n"
		"xorq %%rdx, %%r13\n"
	"B%=:\n"
		// 1..7 trailing bytes: pack them into rdx one byte at a time, last
		// byte first, so the first byte ends up in the lowest bits.  This
		// gives the same value as an 8-byte load masked to the remaining
		// length, but never touches memory beyond key + len.
		"andq $7, %%rcx\n"
		"jz F%=\n"
		"xorq %%rdx, %%rdx\n"
	"T%=:\n"
		"shlq $8, %%rdx\n"
		"movzbq -1(%%r14,%%rcx), %%rax\n"
		"orq %%rax, %%rdx\n"
		"subq $1, %%rcx\n"
		"jnz T%=\n"
		// mixa( tail )
		"movq %3, %%rax\n"
		"mulq %%rdx\n"
		"xorq %%rdx, %%r12\n"
		"xorq %%rax, %%r13\n"
	"F%=:\n"
		// final mixb( h ^ (k + n) ) folded directly into the result:
		// rax = lo ^ hi ^ h ^ k
		"leaq (%%r13,%4), %%rax\n"
		"xorq %%r12, %%rax\n"
		"mulq %4\n"
		"xorq %%rdx, %%rax\n"
		"xorq %%r12, %%rax\n"
		"xorq %%r13, %%rax\n"
		// rcx and rdx are modified above, so they are listed as outputs as
		// well; the values written back to key are never used afterwards.
		: "=a"(hash), "=c"(key), "=d"(key)
		: "r"(m), "r"(n), "a"(seed), "c"(len), "d"(key)
		// "memory": the key buffer is read through a register pointer, which
		// the compiler cannot see from the operand list alone.
		: "%r12", "%r13", "%r14", "%r15", "cc", "memory"
	);
	return hash;
#elif !defined(INTEL)
	// cwfold: 64x64 -> 128-bit multiply, low half XORed into lo, high half into hi.
	#define cwfold( a, b, lo, hi ) { p = (u64)(a) * (u128)(b); lo ^= (u64)p; hi ^= (u64)(p >> 64); }
	#define cwmixa( in ) { cwfold( in, m, k, h ); }
	#define cwmixb( in ) { cwfold( in, n, h, k ); }

	const u64 *key8 = (const u64 *)key;
	u64 h = len, k = len + seed + n;
	u128 p;

	while ( len >= 16 ) { cwmixb(key8[0]) cwmixa(key8[1]) key8 += 2; len -= 16; }
	if ( len >= 8 ) { cwmixb(key8[0]) key8 += 1; len -= 8; }
	if ( len ) {
		// 1..7 trailing bytes: assemble them byte by byte (first byte in the
		// lowest bits) instead of loading a whole word, so nothing beyond
		// key + len is read.
		const u8 *rest = (const u8 *)key8;
		u64 tail = 0;
		while ( len ) { tail = ( tail << 8 ) | rest[--len]; }
		cwmixa( tail )
	}
	cwmixb( h ^ (k + n) )

	// Keep the helper macros local to this function.
	#undef cwmixb
	#undef cwmixa
	#undef cwfold
	return k ^ h;
#else
	#error "CrapWow64: no implementation for this compiler/target combination (INTEL is defined but the x86-64 inline asm path is not enabled)"
#endif

}