1#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
32#![cfg(feature = "x86")]
33#![allow(
34 clippy::wildcard_imports,
35 clippy::cast_possible_truncation,
36 clippy::too_many_arguments,
37 clippy::inline_always,
38 clippy::doc_markdown,
39 dead_code
40)]
41
42#[cfg(target_arch = "x86")]
43use core::arch::x86::*;
44#[cfg(target_arch = "x86_64")]
45use core::arch::x86_64::*;
46
47use crate::color_convert::scalar::{CB_CF, CR_CF, C_G_CB_COEF_2, C_G_CR_COEF_1, YUV_RND, Y_CF};
48
/// A 256-bit SIMD value that can also be viewed as sixteen `i16` lanes.
///
/// Both fields overlay the same 32 bytes, so reading either one requires
/// `unsafe`.
pub union YmmRegister {
    /// The value as one 256-bit integer vector.
    mm256: __m256i,
    /// The same storage viewed as sixteen signed 16-bit integers.
    array: [i16; 16]
}
55
56const R_AVX_COEF: i32 = i32::from_ne_bytes([CR_CF.to_ne_bytes()[0], CR_CF.to_ne_bytes()[1], 0, 0]);
57const B_AVX_COEF: i32 = i32::from_ne_bytes([0, 0, CB_CF.to_ne_bytes()[0], CB_CF.to_ne_bytes()[1]]);
58const G_COEF_AVX_COEF: i32 = i32::from_ne_bytes([
59 C_G_CR_COEF_1.to_ne_bytes()[0],
60 C_G_CR_COEF_1.to_ne_bytes()[1],
61 C_G_CB_COEF_2.to_ne_bytes()[0],
62 C_G_CB_COEF_2.to_ne_bytes()[1]
63]);
64
65#[inline(always)]
86pub fn ycbcr_to_rgb_avx2(
87 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
88) {
89 unsafe {
92 ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
93 }
94}
95
96#[inline]
97#[target_feature(enable = "avx2")]
98unsafe fn ycbcr_to_rgb_avx2_1(
99 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
100) {
101 let (mut r, mut g, mut b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
102
103 r = _mm256_packus_epi16(r, _mm256_setzero_si256());
104 g = _mm256_packus_epi16(g, _mm256_setzero_si256());
105 b = _mm256_packus_epi16(b, _mm256_setzero_si256());
106
107 r = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(r);
108 g = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(g);
109 b = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(b);
110
111 let sh_r = _mm256_setr_epi8(
112 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
113 9, 4, 15, 10, 5
114 );
115 let sh_g = _mm256_setr_epi8(
116 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
117 14, 9, 4, 15, 10
118 );
119 let sh_b = _mm256_setr_epi8(
120 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
121 3, 14, 9, 4, 15
122 );
123
124 let r0 = _mm256_shuffle_epi8(r, sh_r);
125 let g0 = _mm256_shuffle_epi8(g, sh_g);
126 let b0 = _mm256_shuffle_epi8(b, sh_b);
127
128 let m0 = _mm256_setr_epi8(
129 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
130 0, 0, -1, 0, 0
131 );
132 let m1 = _mm256_setr_epi8(
133 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
134 -1, 0, 0, -1, 0
135 );
136
137 let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
138 let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
139 let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
140
141 let rgb0 = _mm256_permute2x128_si256::<32>(p0, p1);
142 let rgb1 = _mm256_permute2x128_si256::<48>(p2, p0);
143
144 _mm256_storeu_si256(out.as_mut_ptr().cast(), rgb0);
145 _mm_storeu_si128(out[32..].as_mut_ptr().cast(), _mm256_castsi256_si128(rgb1));
146
147 *offset += 48;
148}
149
150#[inline]
152#[target_feature(enable = "avx2")]
153unsafe fn ycbcr_to_rgb_baseline_no_clamp(
159 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
160) -> (__m256i, __m256i, __m256i) {
161 let y_c = _mm256_loadu_si256(y.as_ptr().cast());
164 let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
165 let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
166
167 let y_coeff = _mm256_set1_epi32(i32::from(Y_CF));
174 let cr_coeff = _mm256_set1_epi32(R_AVX_COEF);
175 let cb_coeff = _mm256_set1_epi32(B_AVX_COEF);
176 let cg_coeff = _mm256_set1_epi32(G_COEF_AVX_COEF);
177 let v_rnd = _mm256_set1_epi32(i32::from(YUV_RND));
178 let uv_bias = _mm256_set1_epi16(128);
179
180 let v_0 = _mm256_slli_epi16::<8>(cb_c);
182 let u_v_8 = _mm256_or_si256(v_0, cr_c);
183
184 let mut u_v_lo = _mm256_unpacklo_epi8(u_v_8, _mm256_setzero_si256());
185 let mut u_v_hi = _mm256_unpackhi_epi8(u_v_8, _mm256_setzero_si256());
186
187 let mut y_lo = _mm256_unpacklo_epi16(y_c, _mm256_setzero_si256());
188 let mut y_hi = _mm256_unpackhi_epi16(y_c, _mm256_setzero_si256());
189
190 u_v_lo = _mm256_sub_epi16(u_v_lo, uv_bias);
191 u_v_hi = _mm256_sub_epi16(u_v_hi, uv_bias);
192
193 y_lo = _mm256_madd_epi16(y_lo, y_coeff);
194 y_hi = _mm256_madd_epi16(y_hi, y_coeff);
195
196 let mut r_lo = _mm256_madd_epi16(u_v_lo, cr_coeff);
197 let mut r_hi = _mm256_madd_epi16(u_v_hi, cr_coeff);
198
199 let mut g_lo = _mm256_madd_epi16(u_v_lo, cg_coeff);
200 let mut g_hi = _mm256_madd_epi16(u_v_hi, cg_coeff);
201
202 y_lo = _mm256_add_epi32(y_lo, v_rnd);
205 y_hi = _mm256_add_epi32(y_hi, v_rnd);
206
207 let mut b_lo = _mm256_madd_epi16(u_v_lo, cb_coeff);
208 let mut b_hi = _mm256_madd_epi16(u_v_hi, cb_coeff);
209
210 r_lo = _mm256_add_epi32(r_lo, y_lo);
211 r_hi = _mm256_add_epi32(r_hi, y_hi);
212
213 g_lo = _mm256_add_epi32(g_lo, y_lo);
214 g_hi = _mm256_add_epi32(g_hi, y_hi);
215
216 b_lo = _mm256_add_epi32(b_lo, y_lo);
217 b_hi = _mm256_add_epi32(b_hi, y_hi);
218
219 r_lo = _mm256_srai_epi32::<14>(r_lo);
220 r_hi = _mm256_srai_epi32::<14>(r_hi);
221
222 g_lo = _mm256_srai_epi32::<14>(g_lo);
223 g_hi = _mm256_srai_epi32::<14>(g_hi);
224
225 b_lo = _mm256_srai_epi32::<14>(b_lo);
226 b_hi = _mm256_srai_epi32::<14>(b_hi);
227
228 let r = _mm256_packus_epi32(r_lo, r_hi);
229 let g = _mm256_packus_epi32(g_lo, g_hi);
230 let b = _mm256_packus_epi32(b_lo, b_hi);
231
232 return (r, g, b);
233}
234
235#[inline(always)]
236pub fn ycbcr_to_rgba_avx2(
237 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
238) {
239 unsafe {
240 ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
241 }
242}
243
244#[inline]
245#[target_feature(enable = "avx2")]
246#[rustfmt::skip]
247unsafe fn ycbcr_to_rgba_unsafe(
248 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
249 out: &mut [u8],
250 offset: &mut usize,
251)
252{
253 let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
255
256 let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
257
258 let c = _mm256_packus_epi16(r, g); let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); let e = _mm256_unpacklo_epi8(c, d); let f = _mm256_unpackhi_epi8(c, d); let g = _mm256_unpacklo_epi8(e, f); let h = _mm256_unpackhi_epi8(e, f);
271
272 let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
274
275 let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
276
277 let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
278
279 let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
280
281 let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
282
283 let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
284
285 _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
288
289 _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
290
291 *offset += 64;
292}
293
/// Builds a shuffle/permute immediate from four selectors, the same way the
/// C `_MM_SHUFFLE(z, y, x, w)` macro does.
///
/// `w` lands in bits 0-1, `x` in bits 2-3, `y` in bits 4-5 and `z` in
/// bits 6-7. The arguments are not masked, so each is expected to be in
/// `0..=3`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    w | (x << 2) | (y << 4) | (z << 6)
}