#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(dead_code)]
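
// AVX2 implementation of the integer inverse DCT used during JPEG decoding.
// Each __m256i register holds one row of eight 32-bit coefficients, so the
// whole 8x8 block fits in eight registers; both IDCT passes operate on rows,
// with a transpose in between, and the results are packed, clamped and
// written out as i16 samples.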

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::unsafe_utils::{transpose, YmmRegister};

// Bias folded into the second IDCT pass: rounding terms plus the +128 level
// shift, pre-scaled by that pass' right shift of 17.
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);

pub fn idct_avx2(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // The inner function is compiled with `#[target_feature(enable = "avx2")]`,
    // so this wrapper should only be selected after AVX2 support has been
    // confirmed at runtime.
    unsafe {
        idct_int_avx2_inner(in_vector, out_vector, stride);
    }
}

#[target_feature(enable = "avx2")]
#[allow(
    clippy::too_many_lines,
    clippy::cast_possible_truncation,
    clippy::similar_names,
    clippy::op_ref,
    unused_assignments,
    clippy::zero_prefixed_literal
)]
pub unsafe fn idct_int_avx2_inner(
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
) {
    let mut pos = 0;

    // Load the 8x8 block, one row of eight i32 coefficients per register.
    let rw0 = _mm256_loadu_si256(in_vector[00..].as_ptr().cast());
    let rw1 = _mm256_loadu_si256(in_vector[08..].as_ptr().cast());
    let rw2 = _mm256_loadu_si256(in_vector[16..].as_ptr().cast());
    let rw3 = _mm256_loadu_si256(in_vector[24..].as_ptr().cast());
    let rw4 = _mm256_loadu_si256(in_vector[32..].as_ptr().cast());
    let rw5 = _mm256_loadu_si256(in_vector[40..].as_ptr().cast());
    let rw6 = _mm256_loadu_si256(in_vector[48..].as_ptr().cast());
    let rw7 = _mm256_loadu_si256(in_vector[56..].as_ptr().cast());

    // Reload the first row shifted by one element so the DC coefficient is
    // excluded from the all-AC-terms-zero test below.
    let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast());

    let zero = _mm256_setzero_si256();

    let mut non_zero = 0;

    // Each comparison yields an all-ones mask (movemask of -1) only when the
    // whole register is zero, so a running total of -8 means every AC term in
    // the block is zero.
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero));

    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero));

    if non_zero == -8 {
        // Every AC term is zero, so the IDCT collapses to the DC coefficient:
        // round, divide by 8, add the +128 level shift and clamp to 0..=255,
        // then broadcast that value to all 64 output samples.
        let coeff = ((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16;
        let idct_value = _mm_set1_epi16(coeff);

        macro_rules! store {
            ($pos:tt, $value:tt) => {
                // Write eight i16 samples and advance to the next output row.
                _mm_storeu_si128(
                    out_vector
                        .get_mut($pos..$pos + 8)
                        .unwrap()
                        .as_mut_ptr()
                        .cast(),
                    $value
                );
                $pos += stride;
            };
        }
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);

        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);

        return;
    }

    // Wrap the raw registers so the DCT passes below can use operator syntax.
    let mut row0 = YmmRegister { mm256: rw0 };
    let mut row1 = YmmRegister { mm256: rw1 };
    let mut row2 = YmmRegister { mm256: rw2 };
    let mut row3 = YmmRegister { mm256: rw3 };

    let mut row4 = YmmRegister { mm256: rw4 };
    let mut row5 = YmmRegister { mm256: rw5 };
    let mut row6 = YmmRegister { mm256: rw6 };
    let mut row7 = YmmRegister { mm256: rw7 };

    // One pass of the fixed-point IDCT applied to all eight rows at once.
    // `$SCALE_BITS` is the bias added before the final arithmetic right shift
    // by `$scale`.
    macro_rules! dct_pass {
        ($SCALE_BITS:tt, $scale:tt) => {
            // Even part.
            let p1 = (row2 + row6) * 2217;

            let mut t2 = p1 + row6 * -7567;
            let mut t3 = p1 + row2 * 3135;

            let mut t0 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 + row4).mm256, 12)
            };
            let mut t1 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 - row4).mm256, 12)
            };

            let x0 = t0 + t3 + $SCALE_BITS;
            let x3 = t0 - t3 + $SCALE_BITS;
            let x1 = t1 + t2 + $SCALE_BITS;
            let x2 = t1 - t2 + $SCALE_BITS;

            // Odd part.
            let p3 = row7 + row3;
            let p4 = row5 + row1;
            let p1 = row7 + row1;
            let p2 = row5 + row3;
            let p5 = (p3 + p4) * 4816;

            t0 = row7 * 1223;
            t1 = row5 * 8410;
            t2 = row3 * 12586;
            t3 = row1 * 6149;

            let p1 = p5 + p1 * -3685;
            let p2 = p5 + (p2 * -10497);
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;

            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;

            // Butterfly: combine even and odd parts, then scale back down.
            row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, $scale);
            row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, $scale);
            row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, $scale);
            row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, $scale);

            row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, $scale);
            row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, $scale);
            row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, $scale);
            row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, $scale);
        };
    }

    // First pass: works on rows, scaling down by 10 bits.
    dct_pass!(512, 10);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );

    // Second pass: after the transpose this works on columns, scaling down by
    // 17 bits and folding in the rounding and +128 level-shift bias.
    dct_pass!(SCALE_BITS, 17);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );

    // Pack pairs of i32 rows into i16 with saturation, clamp to 0..=255 and
    // store two output rows per invocation. The cross-lane permute fixes up
    // the interleaving `_mm256_packs_epi32` does within each 128-bit lane,
    // leaving one full row in each half of the register.
    macro_rules! permute_store {
        ($x:tt, $y:tt, $index:tt, $out:tt) => {
            let a = _mm256_packs_epi32($x, $y);

            let b = clamp_avx(a);

            let c = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(b);

            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<0>(c)
            );
            $index += stride;
            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<1>(c)
            );
            $index += stride;
        };
    }
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector);
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector);
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector);
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector);
}

/// Clamp every 16-bit lane of `reg` to the valid sample range `0..=255`.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let min_s = _mm256_set1_epi16(0);
    let max_s = _mm256_set1_epi16(255);

    let max_v = _mm256_max_epi16(reg, min_s);
    let min_v = _mm256_min_epi16(max_v, max_s);

    min_v
}

/// Build a 4x2-bit shuffle immediate, in the style of the C `_MM_SHUFFLE`
/// macro, for use with `_mm256_permute4x64_epi64`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}
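
// A minimal usage sketch, not part of the original crate: the `usage_sketch`
// test module and its names are illustrative, and it assumes tests link `std`
// so `is_x86_feature_detected!` is available for the runtime AVX2 check. It
// exercises only the DC-only fast path, where every output sample becomes
// ((dc + 4 + 1024) >> 3) clamped to 0..=255.
#[cfg(test)]
mod usage_sketch {
    use super::idct_avx2;

    #[test]
    fn dc_only_block_is_flat() {
        if !std::is_x86_feature_detected!("avx2") {
            // Nothing to check on machines without AVX2.
            return;
        }
        // A block whose AC terms are all zero and whose DC coefficient is 8:
        // the fast path should broadcast (8 + 4 + 1024) >> 3 = 129 everywhere.
        let mut coefficients = [0i32; 64];
        coefficients[0] = 8;
        let mut samples = [0i16; 64];

        idct_avx2(&mut coefficients, &mut samples, 8);

        assert!(samples.iter().all(|&s| s == 129));
    }
}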