unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i
Performs the complete R/F multiplication, including the reduction step (5 CLMULs in total).