64-bit variant of CrapWow. The implementation is actually still faster than Murmur2 without inline asm in gcc, but I could not find out how to support 64-bit x 64-bit = 128-bit math in icc, so it had to be written as inline asm.
// CrapWow64: hash `len` bytes starting at `key` into a 64-bit value, mixed
// with `seed`.
//
// Algorithm (identical steps in both implementations below):
//   h = len, k = len + seed + n
//   for every full 16-byte block: multiply the first 64-bit word by n and the
//     second by m (64x64 -> 128 bit); xor the low/high halves into h and k
//   if 8 or more bytes remain: fold one more word, multiplied by n
//   if 1..7 bytes remain: fold them, zero-extended, multiplied by m
//   finally fold (h ^ (k + n)) * n and return k ^ h
//
// Implementation is selected at compile time:
//   - (INTEL or GCC) and X64 and not MSVC: hand-written x86-64 inline asm.
//   - otherwise, if not INTEL: C code using the 128-bit type `u128`.
//   - otherwise: compilation is rejected with #error.
//
// Notes:
//   - 64-bit words are loaded straight from `key` with no alignment handling.
//     NOTE(review): the C path does this through a `const u64 *` cast, which
//     assumes the target tolerates unaligned/aliased loads - confirm.
//   - The asm path loads a full 8-byte word for the 1..7 byte tail and masks
//     it, so it may read up to 7 bytes past key + len. The C path reads
//     exactly `len` bytes.
finline u64 fastcall CrapWow64( const u8 *key, u64 len, u64 seed ) {
    const u64 m = 0x95b47aa3355ba1a1, n = 0x8a970be7488fda55;
#if ( defined(INTEL) || defined(GCC) ) && !defined(MSVC) && defined(X64)
    u64 hash;
    // Operand map: %3 = m, %4 = n.
    // Registers:   rax = seed on entry / hash on exit, rcx = len, rdx = key,
    //              r12 = h, r13 = k, r14 = read cursor, r15 = copy of len.
    asm(
        // Setup: k = len + n + seed, h = len, cursor = key, r15 = len.
        "leaq (%%rcx,%4), %%r13\n"
        "movq %%rdx, %%r14\n"
        "movq %%rcx, %%r15\n"
        "movq %%rcx, %%r12\n"
        "addq %%rax, %%r13\n"
        // rcx = len rounded down to a multiple of 16; skip the loop if zero.
        "andq $0xfffffffffffffff0, %%rcx\n"
        "jz QW%=\n"
        // Point r14 just past the 16-byte blocks and let rcx count from
        // -(block bytes) up to 0, so the loop ends on the addq's zero flag.
        "addq %%rcx, %%r14\n\n"
        "negq %%rcx\n"
        "XW%=:\n"
        // First word * n: low half into h, high half into k.
        "movq %4, %%rax\n"
        "mulq (%%r14,%%rcx)\n"
        "xorq %%rax, %%r12\n"
        "xorq %%rdx, %%r13\n"
        // Second word * m: high half into h, low half into k.
        "movq %3, %%rax\n"
        "mulq 8(%%r14,%%rcx)\n"
        "xorq %%rdx, %%r12\n"
        "xorq %%rax, %%r13\n"
        "addq $16, %%rcx\n"
        "jnz XW%=\n"
        "QW%=:\n"
        // Restore len into rcx; if bit 3 of len is set, fold one 8-byte word
        // (* n: low half into h, high half into k) and advance the cursor.
        "movq %%r15, %%rcx\n"
        "andq $8, %%r15\n"
        "jz B%=\n"
        "movq %4, %%rax\n"
        "mulq (%%r14)\n"
        "addq $8, %%r14\n"
        "xorq %%rax, %%r12\n"
        "xorq %%rdx, %%r13\n"
        "B%=:\n"
        // Tail of 1..7 bytes: build the mask (1 << (8 * (len & 7))) - 1, and
        // it with the 8-byte word at the cursor, multiply by m
        // (high half into h, low half into k).
        // NOTE: the load below is a full 8 bytes regardless of the tail size.
        "andq $7, %%rcx\n"
        "jz F%=\n"
        "movq $1, %%rdx\n"
        "shlq $3, %%rcx\n"
        "movq %3, %%rax\n"
        "shlq %%cl, %%rdx\n"
        "addq $-1, %%rdx\n"
        "andq (%%r14), %%rdx\n"
        "mulq %%rdx\n"
        "xorq %%rdx, %%r12\n"
        "xorq %%rax, %%r13\n"
        "F%=:\n"
        // Finalization: rdx:rax = (h ^ (k + n)) * n,
        // hash = low ^ high ^ h ^ k.
        "leaq (%%r13,%4), %%rax\n"
        "xorq %%r12, %%rax\n"
        "mulq %4\n"
        "xorq %%rdx, %%rax\n"
        "xorq %%r12, %%rax\n"
        "xorq %%r13, %%rax\n"
        // rcx and rdx are overwritten by the code above; they are listed as
        // outputs bound to `key`, which is not used again in this branch.
        : "=a"(hash), "=c"(key), "=d"(key)
        : "r"(m), "r"(n), "a"(seed), "c"(len), "d"(key)
        // "memory": the asm reads the key bytes through a pointer that is not
        // described as a memory operand, so the compiler must not move stores
        // to that buffer across this statement.
        : "%r12", "%r13", "%r14", "%r15", "cc", "memory"
    );
    return hash;
#elif !defined(INTEL)
    // Portable path. `u128` is presumably a 128-bit unsigned integer typedef
    // provided elsewhere in the project (it must hold a 64x64-bit product).
    //
    // cwfold: p = a * b as a 128-bit product; low 64 bits are xored into
    // `lo`, high 64 bits into `hi`.
    // cwmixa multiplies by m (low -> k, high -> h);
    // cwmixb multiplies by n (low -> h, high -> k).
#define cwfold( a, b, lo, hi ) { p = (u64)(a) * (u128)(b); lo ^= (u64)p; hi ^= (u64)(p >> 64); }
#define cwmixa( in ) { cwfold( in, m, k, h ); }
#define cwmixb( in ) { cwfold( in, n, h, k ); }
    const u64 *key8 = (const u64 *)key;
    u64 h = len, k = len + seed + n;
    u128 p;
    // Full 16-byte blocks.
    while ( len >= 16 ) { cwmixb(key8[0]) cwmixa(key8[1]) key8 += 2; len -= 16; }
    // One remaining 8-byte word.
    if ( len >= 8 ) { cwmixb(key8[0]) key8 += 1; len -= 8; }
    // Remaining 1..7 bytes: assemble them into a zero-extended word, first
    // byte in the least significant position. This reads only the bytes that
    // belong to the key (no load past key + len) and yields the same value as
    // masking a little-endian 8-byte load.
    if ( len ) {
        const u8 *tail = (const u8 *)key8;
        u64 last = 0, i;
        for ( i = 0; i < len; i++ ) last |= (u64)tail[i] << ( i * 8 );
        cwmixa( last )
    }
    // Final avalanche step.
    cwmixb( h ^ (k + n) )
    return k ^ h;
#else
#error "CrapWow64: no implementation for this compiler/target (needs x86-64 GCC-style inline asm or a 128-bit integer type)"
#endif
}