fn copy_removing_padding_4x( mcu_block: &[&[i16]; 4], width: usize, padded_width: usize, output: &mut [u8], )