fn load_8888_tail( tail: usize, data: &[PremultipliedColorU8], r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8, )