rav1e/transform/inverse.rs

1// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
// Re-export the fastest available inverse-transform implementation:
// x86-64 (NASM) assembly, AArch64 NEON assembly, or the portable
// `rust` fallback module when no assembly path is configured.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::transform::inverse::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::transform::inverse::*;
  } else {
    pub use self::rust::*;
  }
}
19
20use crate::tiling::PlaneRegionMut;
21use crate::util::*;
22
23// TODO: move 1d txfm code to rust module.
24
25use super::clamp_value;
26use super::consts::*;
27use super::get_1d_tx_types;
28use super::get_rect_tx_log_ratio;
29use super::half_btf;
30use super::TxSize;
31use super::TxType;
32
/// 4-point inverse Walsh–Hadamard transform (used by the lossless path).
///
/// Follows the spec's in-place butterfly exactly:
/// <https://aomediacodec.github.io/av1-spec/#inverse-walsh-hadamard-transform-process>
///
/// # Panics
///
/// - If `input` or `output` have fewer than 4 items.
pub fn av1_iwht4(input: &[i32], output: &mut [i32], _range: usize) {
  assert!(input.len() >= 4);
  assert!(output.len() >= 4);

  // Spec variable names: a = T[0], c = T[1], d = T[2], b = T[3].
  let (a, c, d, b) = (input[0], input[1], input[2], input[3]);
  let a = a + c;
  let d = d - b;
  let e = (a - d) >> 1;
  let b = e - b;
  let c = e - c;
  output[0] = a - b;
  output[1] = b;
  output[2] = c;
  output[3] = d + c;
}
55
// Q12 fixed-point cosine table for the inverse transforms:
// COSPI_INV[i] == round(4096 * cos(i * PI / 128)).
static COSPI_INV: [i32; 64] = [
  4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, 3948,
  3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, 3513, 3461,
  3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, 2824, 2751, 2675,
  2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, 1931, 1842, 1751, 1660,
  1567, 1474, 1380, 1285, 1189, 1092, 995, 897, 799, 700, 601, 501, 401, 301,
  201, 101,
];

// Q12 sine constants used only by the 4-point inverse ADST.
// These match the AV1/libaom sinpi table (scaled sin(i * PI / 9) values);
// NOTE(review): exact scaling constant taken on trust from the spec tables.
static SINPI_INV: [i32; 5] = [0, 1321, 2482, 3344, 3803];

// Number of fractional bits in the fixed-point tables above; passed to
// `half_btf` as the rounding shift for every butterfly.
const INV_COS_BIT: usize = 12;
68
69/// # Panics
70///
71/// - If `input` or `output` have fewer than 4 items.
72pub fn av1_idct4(input: &[i32], output: &mut [i32], range: usize) {
73  assert!(input.len() >= 4);
74  assert!(output.len() >= 4);
75
76  // stage 1
77  let stg1 = [input[0], input[2], input[1], input[3]];
78
79  // stage 2
80  let stg2 = [
81    half_btf(COSPI_INV[32], stg1[0], COSPI_INV[32], stg1[1], INV_COS_BIT),
82    half_btf(COSPI_INV[32], stg1[0], -COSPI_INV[32], stg1[1], INV_COS_BIT),
83    half_btf(COSPI_INV[48], stg1[2], -COSPI_INV[16], stg1[3], INV_COS_BIT),
84    half_btf(COSPI_INV[16], stg1[2], COSPI_INV[48], stg1[3], INV_COS_BIT),
85  ];
86
87  // stage 3
88  output[0] = clamp_value(stg2[0] + stg2[3], range);
89  output[1] = clamp_value(stg2[1] + stg2[2], range);
90  output[2] = clamp_value(stg2[1] - stg2[2], range);
91  output[3] = clamp_value(stg2[0] - stg2[3], range);
92}
93
/// Flipped 4-point inverse ADST: the regular inverse ADST followed by
/// reversing the four outputs.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 4 items (via `av1_iadst4`).
pub fn av1_iflipadst4(input: &[i32], output: &mut [i32], range: usize) {
  av1_iadst4(input, output, range);
  output[..4].reverse();
}
98
99/// # Panics
100///
101/// - If `input` or `output` have fewer than 4 items.
102#[inline(always)]
103pub fn av1_iadst4(input: &[i32], output: &mut [i32], _range: usize) {
104  assert!(input.len() >= 4);
105  assert!(output.len() >= 4);
106
107  let bit = 12;
108
109  let x0 = input[0];
110  let x1 = input[1];
111  let x2 = input[2];
112  let x3 = input[3];
113
114  // stage 1
115  let s0 = SINPI_INV[1] * x0;
116  let s1 = SINPI_INV[2] * x0;
117  let s2 = SINPI_INV[3] * x1;
118  let s3 = SINPI_INV[4] * x2;
119  let s4 = SINPI_INV[1] * x2;
120  let s5 = SINPI_INV[2] * x3;
121  let s6 = SINPI_INV[4] * x3;
122
123  // stage 2
124  let s7 = (x0 - x2) + x3;
125
126  // stage 3
127  let s0 = s0 + s3;
128  let s1 = s1 - s4;
129  let s3 = s2;
130  let s2 = SINPI_INV[3] * s7;
131
132  // stage 4
133  let s0 = s0 + s5;
134  let s1 = s1 - s6;
135
136  // stage 5
137  let x0 = s0 + s3;
138  let x1 = s1 + s3;
139  let x2 = s2;
140  let x3 = s0 + s1;
141
142  // stage 6
143  let x3 = x3 - s3;
144
145  output[0] = round_shift(x0, bit);
146  output[1] = round_shift(x1, bit);
147  output[2] = round_shift(x2, bit);
148  output[3] = round_shift(x3, bit);
149}
150
151pub fn av1_iidentity4(input: &[i32], output: &mut [i32], _range: usize) {
152  output[..4]
153    .iter_mut()
154    .zip(input[..4].iter())
155    .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * *inp, 12));
156}
157
/// 8-point inverse DCT (one dimension).
///
/// The even input coefficients are transformed by reusing [`av1_idct4`]; the
/// odd coefficients go through the stage 1-4 butterfly network below, and
/// stage 5 folds the two halves together symmetrically. `range` is the
/// intermediate bit depth used by `clamp_value` between stages.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items.
pub fn av1_idct8(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 8);
  assert!(output.len() >= 8);

  // Even coefficients: delegate to the 4-point inverse DCT.
  let temp_in = [input[0], input[2], input[4], input[6]];
  let mut temp_out: [i32; 4] = [0; 4];
  av1_idct4(&temp_in, &mut temp_out, range);

  // stage 0

  // stage 1: reorder the odd coefficients.
  let stg1 = [input[1], input[5], input[3], input[7]];

  // stage 2: rotation butterflies on the odd half.
  let stg2 = [
    half_btf(COSPI_INV[56], stg1[0], -COSPI_INV[8], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg1[1], -COSPI_INV[40], stg1[2], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg1[1], COSPI_INV[24], stg1[2], INV_COS_BIT),
    half_btf(COSPI_INV[8], stg1[0], COSPI_INV[56], stg1[3], INV_COS_BIT),
  ];

  // stage 3: add/sub pairs, clamped to the working bit depth.
  let stg3 = [
    clamp_value(stg2[0] + stg2[1], range),
    clamp_value(stg2[0] - stg2[1], range),
    clamp_value(-stg2[2] + stg2[3], range),
    clamp_value(stg2[2] + stg2[3], range),
  ];

  // stage 4: middle pair rotated by cos/sin(pi/4).
  let stg4 = [
    stg3[0],
    half_btf(-COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
    stg3[3],
  ];

  // stage 5: combine the idct4 half with the odd half, mirrored.
  output[0] = clamp_value(temp_out[0] + stg4[3], range);
  output[1] = clamp_value(temp_out[1] + stg4[2], range);
  output[2] = clamp_value(temp_out[2] + stg4[1], range);
  output[3] = clamp_value(temp_out[3] + stg4[0], range);
  output[4] = clamp_value(temp_out[3] - stg4[0], range);
  output[5] = clamp_value(temp_out[2] - stg4[1], range);
  output[6] = clamp_value(temp_out[1] - stg4[2], range);
  output[7] = clamp_value(temp_out[0] - stg4[3], range);
}
209
/// Flipped 8-point inverse ADST: the regular inverse ADST followed by
/// reversing the eight outputs.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items (via `av1_iadst8`).
pub fn av1_iflipadst8(input: &[i32], output: &mut [i32], range: usize) {
  av1_iadst8(input, output, range);
  output[..8].reverse();
}
214
/// 8-point inverse ADST (one dimension), implemented as the spec's 7-stage
/// butterfly network. `range` is the intermediate bit depth used by
/// `clamp_value` between the butterfly stages; stage 7 applies the final
/// sign/permutation pattern with no rounding.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items.
#[inline(always)]
pub fn av1_iadst8(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 8);
  assert!(output.len() >= 8);

  // stage 1: reorder inputs per the ADST permutation.
  let stg1 = [
    input[7], input[0], input[5], input[2], input[3], input[4], input[1],
    input[6],
  ];

  // stage 2: paired rotations in Q12 fixed point.
  let stg2 = [
    half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[36], stg1[4], COSPI_INV[28], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg1[4], -COSPI_INV[36], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[52], stg1[6], COSPI_INV[12], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg1[6], -COSPI_INV[52], stg1[7], INV_COS_BIT),
  ];

  // stage 3: add/sub across the two halves, clamped.
  let stg3 = [
    clamp_value(stg2[0] + stg2[4], range),
    clamp_value(stg2[1] + stg2[5], range),
    clamp_value(stg2[2] + stg2[6], range),
    clamp_value(stg2[3] + stg2[7], range),
    clamp_value(stg2[0] - stg2[4], range),
    clamp_value(stg2[1] - stg2[5], range),
    clamp_value(stg2[2] - stg2[6], range),
    clamp_value(stg2[3] - stg2[7], range),
  ];

  // stage 4: rotate the second half only.
  let stg4 = [
    stg3[0],
    stg3[1],
    stg3[2],
    stg3[3],
    half_btf(COSPI_INV[16], stg3[4], COSPI_INV[48], stg3[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg3[4], -COSPI_INV[16], stg3[5], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg3[6], COSPI_INV[16], stg3[7], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg3[6], COSPI_INV[48], stg3[7], INV_COS_BIT),
  ];

  // stage 5: add/sub within each half, clamped.
  let stg5 = [
    clamp_value(stg4[0] + stg4[2], range),
    clamp_value(stg4[1] + stg4[3], range),
    clamp_value(stg4[0] - stg4[2], range),
    clamp_value(stg4[1] - stg4[3], range),
    clamp_value(stg4[4] + stg4[6], range),
    clamp_value(stg4[5] + stg4[7], range),
    clamp_value(stg4[4] - stg4[6], range),
    clamp_value(stg4[5] - stg4[7], range),
  ];

  // stage 6: final pi/4 rotations on the odd-indexed pairs.
  let stg6 = [
    stg5[0],
    stg5[1],
    half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[3], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[2], -COSPI_INV[32], stg5[3], INV_COS_BIT),
    stg5[4],
    stg5[5],
    half_btf(COSPI_INV[32], stg5[6], COSPI_INV[32], stg5[7], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[6], -COSPI_INV[32], stg5[7], INV_COS_BIT),
  ];

  // stage 7: output permutation with alternating sign flips.
  output[0] = stg6[0];
  output[1] = -stg6[4];
  output[2] = stg6[6];
  output[3] = -stg6[2];
  output[4] = stg6[3];
  output[5] = -stg6[7];
  output[6] = stg6[5];
  output[7] = -stg6[1];
}
299
/// 8-point inverse identity "transform": each coefficient is simply doubled
/// (the exact scale factor for this size — no rounding needed).
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items.
pub fn av1_iidentity8(input: &[i32], output: &mut [i32], _range: usize) {
  let (dst, src) = (&mut output[..8], &input[..8]);
  for (outp, &inp) in dst.iter_mut().zip(src) {
    *outp = 2 * inp;
  }
}
306
/// 16-point inverse DCT (one dimension).
///
/// The even input coefficients are transformed by reusing [`av1_idct8`]; the
/// odd coefficients run through the stage 1-6 butterfly network below, and
/// stage 7 folds the two halves together symmetrically. `range` is the
/// intermediate bit depth used by `clamp_value` between stages.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 16 items.
fn av1_idct16(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 16);
  assert!(output.len() >= 16);

  // Even coefficients: delegate to the 8-point inverse DCT.
  let temp_in = [
    input[0], input[2], input[4], input[6], input[8], input[10], input[12],
    input[14],
  ];
  let mut temp_out: [i32; 8] = [0; 8];
  av1_idct8(&temp_in, &mut temp_out, range);

  // stage 1: reorder the odd coefficients.
  let stg1 = [
    input[1], input[9], input[5], input[13], input[3], input[11], input[7],
    input[15],
  ];

  // stage 2: rotation butterflies on the odd half.
  let stg2 = [
    half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg1[1], -COSPI_INV[36], stg1[6], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg1[3], -COSPI_INV[52], stg1[4], INV_COS_BIT),
    half_btf(COSPI_INV[52], stg1[3], COSPI_INV[12], stg1[4], INV_COS_BIT),
    half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[36], stg1[1], COSPI_INV[28], stg1[6], INV_COS_BIT),
    half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[7], INV_COS_BIT),
  ];

  // stage 3: add/sub pairs, clamped to the working bit depth.
  let stg3 = [
    clamp_value(stg2[0] + stg2[1], range),
    clamp_value(stg2[0] - stg2[1], range),
    clamp_value(-stg2[2] + stg2[3], range),
    clamp_value(stg2[2] + stg2[3], range),
    clamp_value(stg2[4] + stg2[5], range),
    clamp_value(stg2[4] - stg2[5], range),
    clamp_value(-stg2[6] + stg2[7], range),
    clamp_value(stg2[6] + stg2[7], range),
  ];

  // stage 4: inner rotations; outer elements pass through.
  let stg4 = [
    stg3[0],
    half_btf(-COSPI_INV[16], stg3[1], COSPI_INV[48], stg3[6], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg3[2], -COSPI_INV[16], stg3[5], INV_COS_BIT),
    stg3[3],
    stg3[4],
    half_btf(-COSPI_INV[16], stg3[2], COSPI_INV[48], stg3[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg3[1], COSPI_INV[16], stg3[6], INV_COS_BIT),
    stg3[7],
  ];

  // stage 5: add/sub across mirrored positions, clamped.
  let stg5 = [
    clamp_value(stg4[0] + stg4[3], range),
    clamp_value(stg4[1] + stg4[2], range),
    clamp_value(stg4[1] - stg4[2], range),
    clamp_value(stg4[0] - stg4[3], range),
    clamp_value(-stg4[4] + stg4[7], range),
    clamp_value(-stg4[5] + stg4[6], range),
    clamp_value(stg4[5] + stg4[6], range),
    clamp_value(stg4[4] + stg4[7], range),
  ];

  // stage 6: central pi/4 rotations; ends pass through.
  let stg6 = [
    stg5[0],
    stg5[1],
    half_btf(-COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
    stg5[6],
    stg5[7],
  ];

  // stage 7: combine the idct8 half with the odd half, mirrored.
  output[0] = clamp_value(temp_out[0] + stg6[7], range);
  output[1] = clamp_value(temp_out[1] + stg6[6], range);
  output[2] = clamp_value(temp_out[2] + stg6[5], range);
  output[3] = clamp_value(temp_out[3] + stg6[4], range);
  output[4] = clamp_value(temp_out[4] + stg6[3], range);
  output[5] = clamp_value(temp_out[5] + stg6[2], range);
  output[6] = clamp_value(temp_out[6] + stg6[1], range);
  output[7] = clamp_value(temp_out[7] + stg6[0], range);
  output[8] = clamp_value(temp_out[7] - stg6[0], range);
  output[9] = clamp_value(temp_out[6] - stg6[1], range);
  output[10] = clamp_value(temp_out[5] - stg6[2], range);
  output[11] = clamp_value(temp_out[4] - stg6[3], range);
  output[12] = clamp_value(temp_out[3] - stg6[4], range);
  output[13] = clamp_value(temp_out[2] - stg6[5], range);
  output[14] = clamp_value(temp_out[1] - stg6[6], range);
  output[15] = clamp_value(temp_out[0] - stg6[7], range);
}
403
/// Flipped 16-point inverse ADST: the regular inverse ADST followed by
/// reversing the sixteen outputs.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 16 items (via `av1_iadst16`).
pub fn av1_iflipadst16(input: &[i32], output: &mut [i32], range: usize) {
  av1_iadst16(input, output, range);
  output[..16].reverse();
}
408
/// 16-point inverse ADST (one dimension), implemented as the spec's 9-stage
/// butterfly network. `range` is the intermediate bit depth used by
/// `clamp_value` between the butterfly stages; stage 9 applies the final
/// sign/permutation pattern with no rounding.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 16 items.
#[inline(always)]
fn av1_iadst16(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 16);
  assert!(output.len() >= 16);

  // stage 1: reorder inputs per the ADST permutation.
  let stg1 = [
    input[15], input[0], input[13], input[2], input[11], input[4], input[9],
    input[6], input[7], input[8], input[5], input[10], input[3], input[12],
    input[1], input[14],
  ];

  // stage 2: paired rotations in Q12 fixed point.
  let stg2 = [
    half_btf(COSPI_INV[2], stg1[0], COSPI_INV[62], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[62], stg1[0], -COSPI_INV[2], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[10], stg1[2], COSPI_INV[54], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[54], stg1[2], -COSPI_INV[10], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[18], stg1[4], COSPI_INV[46], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[46], stg1[4], -COSPI_INV[18], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[26], stg1[6], COSPI_INV[38], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[38], stg1[6], -COSPI_INV[26], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[34], stg1[8], COSPI_INV[30], stg1[9], INV_COS_BIT),
    half_btf(COSPI_INV[30], stg1[8], -COSPI_INV[34], stg1[9], INV_COS_BIT),
    half_btf(COSPI_INV[42], stg1[10], COSPI_INV[22], stg1[11], INV_COS_BIT),
    half_btf(COSPI_INV[22], stg1[10], -COSPI_INV[42], stg1[11], INV_COS_BIT),
    half_btf(COSPI_INV[50], stg1[12], COSPI_INV[14], stg1[13], INV_COS_BIT),
    half_btf(COSPI_INV[14], stg1[12], -COSPI_INV[50], stg1[13], INV_COS_BIT),
    half_btf(COSPI_INV[58], stg1[14], COSPI_INV[6], stg1[15], INV_COS_BIT),
    half_btf(COSPI_INV[6], stg1[14], -COSPI_INV[58], stg1[15], INV_COS_BIT),
  ];

  // stage 3: add/sub across the two halves, clamped.
  let stg3 = [
    clamp_value(stg2[0] + stg2[8], range),
    clamp_value(stg2[1] + stg2[9], range),
    clamp_value(stg2[2] + stg2[10], range),
    clamp_value(stg2[3] + stg2[11], range),
    clamp_value(stg2[4] + stg2[12], range),
    clamp_value(stg2[5] + stg2[13], range),
    clamp_value(stg2[6] + stg2[14], range),
    clamp_value(stg2[7] + stg2[15], range),
    clamp_value(stg2[0] - stg2[8], range),
    clamp_value(stg2[1] - stg2[9], range),
    clamp_value(stg2[2] - stg2[10], range),
    clamp_value(stg2[3] - stg2[11], range),
    clamp_value(stg2[4] - stg2[12], range),
    clamp_value(stg2[5] - stg2[13], range),
    clamp_value(stg2[6] - stg2[14], range),
    clamp_value(stg2[7] - stg2[15], range),
  ];

  // stage 4: rotate the second half only.
  let stg4 = [
    stg3[0],
    stg3[1],
    stg3[2],
    stg3[3],
    stg3[4],
    stg3[5],
    stg3[6],
    stg3[7],
    half_btf(COSPI_INV[8], stg3[8], COSPI_INV[56], stg3[9], INV_COS_BIT),
    half_btf(COSPI_INV[56], stg3[8], -COSPI_INV[8], stg3[9], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg3[10], COSPI_INV[24], stg3[11], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg3[10], -COSPI_INV[40], stg3[11], INV_COS_BIT),
    half_btf(-COSPI_INV[56], stg3[12], COSPI_INV[8], stg3[13], INV_COS_BIT),
    half_btf(COSPI_INV[8], stg3[12], COSPI_INV[56], stg3[13], INV_COS_BIT),
    half_btf(-COSPI_INV[24], stg3[14], COSPI_INV[40], stg3[15], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg3[14], COSPI_INV[24], stg3[15], INV_COS_BIT),
  ];

  // stage 5: add/sub within each half, clamped.
  let stg5 = [
    clamp_value(stg4[0] + stg4[4], range),
    clamp_value(stg4[1] + stg4[5], range),
    clamp_value(stg4[2] + stg4[6], range),
    clamp_value(stg4[3] + stg4[7], range),
    clamp_value(stg4[0] - stg4[4], range),
    clamp_value(stg4[1] - stg4[5], range),
    clamp_value(stg4[2] - stg4[6], range),
    clamp_value(stg4[3] - stg4[7], range),
    clamp_value(stg4[8] + stg4[12], range),
    clamp_value(stg4[9] + stg4[13], range),
    clamp_value(stg4[10] + stg4[14], range),
    clamp_value(stg4[11] + stg4[15], range),
    clamp_value(stg4[8] - stg4[12], range),
    clamp_value(stg4[9] - stg4[13], range),
    clamp_value(stg4[10] - stg4[14], range),
    clamp_value(stg4[11] - stg4[15], range),
  ];

  // stage 6: rotate the second quarter of each half.
  let stg6 = [
    stg5[0],
    stg5[1],
    stg5[2],
    stg5[3],
    half_btf(COSPI_INV[16], stg5[4], COSPI_INV[48], stg5[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[4], -COSPI_INV[16], stg5[5], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg5[6], COSPI_INV[16], stg5[7], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg5[6], COSPI_INV[48], stg5[7], INV_COS_BIT),
    stg5[8],
    stg5[9],
    stg5[10],
    stg5[11],
    half_btf(COSPI_INV[16], stg5[12], COSPI_INV[48], stg5[13], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[12], -COSPI_INV[16], stg5[13], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg5[14], COSPI_INV[16], stg5[15], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg5[14], COSPI_INV[48], stg5[15], INV_COS_BIT),
  ];

  // stage 7: add/sub adjacent pairs, clamped.
  let stg7 = [
    clamp_value(stg6[0] + stg6[2], range),
    clamp_value(stg6[1] + stg6[3], range),
    clamp_value(stg6[0] - stg6[2], range),
    clamp_value(stg6[1] - stg6[3], range),
    clamp_value(stg6[4] + stg6[6], range),
    clamp_value(stg6[5] + stg6[7], range),
    clamp_value(stg6[4] - stg6[6], range),
    clamp_value(stg6[5] - stg6[7], range),
    clamp_value(stg6[8] + stg6[10], range),
    clamp_value(stg6[9] + stg6[11], range),
    clamp_value(stg6[8] - stg6[10], range),
    clamp_value(stg6[9] - stg6[11], range),
    clamp_value(stg6[12] + stg6[14], range),
    clamp_value(stg6[13] + stg6[15], range),
    clamp_value(stg6[12] - stg6[14], range),
    clamp_value(stg6[13] - stg6[15], range),
  ];

  // stage 8: final pi/4 rotations on the odd-indexed pairs.
  let stg8 = [
    stg7[0],
    stg7[1],
    half_btf(COSPI_INV[32], stg7[2], COSPI_INV[32], stg7[3], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[2], -COSPI_INV[32], stg7[3], INV_COS_BIT),
    stg7[4],
    stg7[5],
    half_btf(COSPI_INV[32], stg7[6], COSPI_INV[32], stg7[7], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[6], -COSPI_INV[32], stg7[7], INV_COS_BIT),
    stg7[8],
    stg7[9],
    half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[11], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[10], -COSPI_INV[32], stg7[11], INV_COS_BIT),
    stg7[12],
    stg7[13],
    half_btf(COSPI_INV[32], stg7[14], COSPI_INV[32], stg7[15], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[14], -COSPI_INV[32], stg7[15], INV_COS_BIT),
  ];

  // stage 9: output permutation with alternating sign flips.
  output[0] = stg8[0];
  output[1] = -stg8[8];
  output[2] = stg8[12];
  output[3] = -stg8[4];
  output[4] = stg8[6];
  output[5] = -stg8[14];
  output[6] = stg8[10];
  output[7] = -stg8[2];
  output[8] = stg8[3];
  output[9] = -stg8[11];
  output[10] = stg8[15];
  output[11] = -stg8[7];
  output[12] = stg8[5];
  output[13] = -stg8[13];
  output[14] = stg8[9];
  output[15] = -stg8[1];
}
579
580fn av1_iidentity16(input: &[i32], output: &mut [i32], _range: usize) {
581  output[..16]
582    .iter_mut()
583    .zip(input[..16].iter())
584    .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * 2 * *inp, 12));
585}
586
587fn av1_idct32(input: &[i32], output: &mut [i32], range: usize) {
588  assert!(input.len() >= 32);
589  assert!(output.len() >= 32);
590
591  // stage 1;
592  let stg1 = [
593    input[0], input[16], input[8], input[24], input[4], input[20], input[12],
594    input[28], input[2], input[18], input[10], input[26], input[6], input[22],
595    input[14], input[30], input[1], input[17], input[9], input[25], input[5],
596    input[21], input[13], input[29], input[3], input[19], input[11],
597    input[27], input[7], input[23], input[15], input[31],
598  ];
599
600  // stage 2
601  let stg2 = [
602    stg1[0],
603    stg1[1],
604    stg1[2],
605    stg1[3],
606    stg1[4],
607    stg1[5],
608    stg1[6],
609    stg1[7],
610    stg1[8],
611    stg1[9],
612    stg1[10],
613    stg1[11],
614    stg1[12],
615    stg1[13],
616    stg1[14],
617    stg1[15],
618    half_btf(COSPI_INV[62], stg1[16], -COSPI_INV[2], stg1[31], INV_COS_BIT),
619    half_btf(COSPI_INV[30], stg1[17], -COSPI_INV[34], stg1[30], INV_COS_BIT),
620    half_btf(COSPI_INV[46], stg1[18], -COSPI_INV[18], stg1[29], INV_COS_BIT),
621    half_btf(COSPI_INV[14], stg1[19], -COSPI_INV[50], stg1[28], INV_COS_BIT),
622    half_btf(COSPI_INV[54], stg1[20], -COSPI_INV[10], stg1[27], INV_COS_BIT),
623    half_btf(COSPI_INV[22], stg1[21], -COSPI_INV[42], stg1[26], INV_COS_BIT),
624    half_btf(COSPI_INV[38], stg1[22], -COSPI_INV[26], stg1[25], INV_COS_BIT),
625    half_btf(COSPI_INV[6], stg1[23], -COSPI_INV[58], stg1[24], INV_COS_BIT),
626    half_btf(COSPI_INV[58], stg1[23], COSPI_INV[6], stg1[24], INV_COS_BIT),
627    half_btf(COSPI_INV[26], stg1[22], COSPI_INV[38], stg1[25], INV_COS_BIT),
628    half_btf(COSPI_INV[42], stg1[21], COSPI_INV[22], stg1[26], INV_COS_BIT),
629    half_btf(COSPI_INV[10], stg1[20], COSPI_INV[54], stg1[27], INV_COS_BIT),
630    half_btf(COSPI_INV[50], stg1[19], COSPI_INV[14], stg1[28], INV_COS_BIT),
631    half_btf(COSPI_INV[18], stg1[18], COSPI_INV[46], stg1[29], INV_COS_BIT),
632    half_btf(COSPI_INV[34], stg1[17], COSPI_INV[30], stg1[30], INV_COS_BIT),
633    half_btf(COSPI_INV[2], stg1[16], COSPI_INV[62], stg1[31], INV_COS_BIT),
634  ];
635
636  // stage 3
637  let stg3 = [
638    stg2[0],
639    stg2[1],
640    stg2[2],
641    stg2[3],
642    stg2[4],
643    stg2[5],
644    stg2[6],
645    stg2[7],
646    half_btf(COSPI_INV[60], stg2[8], -COSPI_INV[4], stg2[15], INV_COS_BIT),
647    half_btf(COSPI_INV[28], stg2[9], -COSPI_INV[36], stg2[14], INV_COS_BIT),
648    half_btf(COSPI_INV[44], stg2[10], -COSPI_INV[20], stg2[13], INV_COS_BIT),
649    half_btf(COSPI_INV[12], stg2[11], -COSPI_INV[52], stg2[12], INV_COS_BIT),
650    half_btf(COSPI_INV[52], stg2[11], COSPI_INV[12], stg2[12], INV_COS_BIT),
651    half_btf(COSPI_INV[20], stg2[10], COSPI_INV[44], stg2[13], INV_COS_BIT),
652    half_btf(COSPI_INV[36], stg2[9], COSPI_INV[28], stg2[14], INV_COS_BIT),
653    half_btf(COSPI_INV[4], stg2[8], COSPI_INV[60], stg2[15], INV_COS_BIT),
654    clamp_value(stg2[16] + stg2[17], range),
655    clamp_value(stg2[16] - stg2[17], range),
656    clamp_value(-stg2[18] + stg2[19], range),
657    clamp_value(stg2[18] + stg2[19], range),
658    clamp_value(stg2[20] + stg2[21], range),
659    clamp_value(stg2[20] - stg2[21], range),
660    clamp_value(-stg2[22] + stg2[23], range),
661    clamp_value(stg2[22] + stg2[23], range),
662    clamp_value(stg2[24] + stg2[25], range),
663    clamp_value(stg2[24] - stg2[25], range),
664    clamp_value(-stg2[26] + stg2[27], range),
665    clamp_value(stg2[26] + stg2[27], range),
666    clamp_value(stg2[28] + stg2[29], range),
667    clamp_value(stg2[28] - stg2[29], range),
668    clamp_value(-stg2[30] + stg2[31], range),
669    clamp_value(stg2[30] + stg2[31], range),
670  ];
671
672  // stage 4
673  let stg4 = [
674    stg3[0],
675    stg3[1],
676    stg3[2],
677    stg3[3],
678    half_btf(COSPI_INV[56], stg3[4], -COSPI_INV[8], stg3[7], INV_COS_BIT),
679    half_btf(COSPI_INV[24], stg3[5], -COSPI_INV[40], stg3[6], INV_COS_BIT),
680    half_btf(COSPI_INV[40], stg3[5], COSPI_INV[24], stg3[6], INV_COS_BIT),
681    half_btf(COSPI_INV[8], stg3[4], COSPI_INV[56], stg3[7], INV_COS_BIT),
682    clamp_value(stg3[8] + stg3[9], range),
683    clamp_value(stg3[8] - stg3[9], range),
684    clamp_value(-stg3[10] + stg3[11], range),
685    clamp_value(stg3[10] + stg3[11], range),
686    clamp_value(stg3[12] + stg3[13], range),
687    clamp_value(stg3[12] - stg3[13], range),
688    clamp_value(-stg3[14] + stg3[15], range),
689    clamp_value(stg3[14] + stg3[15], range),
690    stg3[16],
691    half_btf(-COSPI_INV[8], stg3[17], COSPI_INV[56], stg3[30], INV_COS_BIT),
692    half_btf(-COSPI_INV[56], stg3[18], -COSPI_INV[8], stg3[29], INV_COS_BIT),
693    stg3[19],
694    stg3[20],
695    half_btf(-COSPI_INV[40], stg3[21], COSPI_INV[24], stg3[26], INV_COS_BIT),
696    half_btf(-COSPI_INV[24], stg3[22], -COSPI_INV[40], stg3[25], INV_COS_BIT),
697    stg3[23],
698    stg3[24],
699    half_btf(-COSPI_INV[40], stg3[22], COSPI_INV[24], stg3[25], INV_COS_BIT),
700    half_btf(COSPI_INV[24], stg3[21], COSPI_INV[40], stg3[26], INV_COS_BIT),
701    stg3[27],
702    stg3[28],
703    half_btf(-COSPI_INV[8], stg3[18], COSPI_INV[56], stg3[29], INV_COS_BIT),
704    half_btf(COSPI_INV[56], stg3[17], COSPI_INV[8], stg3[30], INV_COS_BIT),
705    stg3[31],
706  ];
707
708  // stage 5
709  let stg5 = [
710    half_btf(COSPI_INV[32], stg4[0], COSPI_INV[32], stg4[1], INV_COS_BIT),
711    half_btf(COSPI_INV[32], stg4[0], -COSPI_INV[32], stg4[1], INV_COS_BIT),
712    half_btf(COSPI_INV[48], stg4[2], -COSPI_INV[16], stg4[3], INV_COS_BIT),
713    half_btf(COSPI_INV[16], stg4[2], COSPI_INV[48], stg4[3], INV_COS_BIT),
714    clamp_value(stg4[4] + stg4[5], range),
715    clamp_value(stg4[4] - stg4[5], range),
716    clamp_value(-stg4[6] + stg4[7], range),
717    clamp_value(stg4[6] + stg4[7], range),
718    stg4[8],
719    half_btf(-COSPI_INV[16], stg4[9], COSPI_INV[48], stg4[14], INV_COS_BIT),
720    half_btf(-COSPI_INV[48], stg4[10], -COSPI_INV[16], stg4[13], INV_COS_BIT),
721    stg4[11],
722    stg4[12],
723    half_btf(-COSPI_INV[16], stg4[10], COSPI_INV[48], stg4[13], INV_COS_BIT),
724    half_btf(COSPI_INV[48], stg4[9], COSPI_INV[16], stg4[14], INV_COS_BIT),
725    stg4[15],
726    clamp_value(stg4[16] + stg4[19], range),
727    clamp_value(stg4[17] + stg4[18], range),
728    clamp_value(stg4[17] - stg4[18], range),
729    clamp_value(stg4[16] - stg4[19], range),
730    clamp_value(-stg4[20] + stg4[23], range),
731    clamp_value(-stg4[21] + stg4[22], range),
732    clamp_value(stg4[21] + stg4[22], range),
733    clamp_value(stg4[20] + stg4[23], range),
734    clamp_value(stg4[24] + stg4[27], range),
735    clamp_value(stg4[25] + stg4[26], range),
736    clamp_value(stg4[25] - stg4[26], range),
737    clamp_value(stg4[24] - stg4[27], range),
738    clamp_value(-stg4[28] + stg4[31], range),
739    clamp_value(-stg4[29] + stg4[30], range),
740    clamp_value(stg4[29] + stg4[30], range),
741    clamp_value(stg4[28] + stg4[31], range),
742  ];
743
744  // stage 6
745  let stg6 = [
746    clamp_value(stg5[0] + stg5[3], range),
747    clamp_value(stg5[1] + stg5[2], range),
748    clamp_value(stg5[1] - stg5[2], range),
749    clamp_value(stg5[0] - stg5[3], range),
750    stg5[4],
751    half_btf(-COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
752    half_btf(COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
753    stg5[7],
754    clamp_value(stg5[8] + stg5[11], range),
755    clamp_value(stg5[9] + stg5[10], range),
756    clamp_value(stg5[9] - stg5[10], range),
757    clamp_value(stg5[8] - stg5[11], range),
758    clamp_value(-stg5[12] + stg5[15], range),
759    clamp_value(-stg5[13] + stg5[14], range),
760    clamp_value(stg5[13] + stg5[14], range),
761    clamp_value(stg5[12] + stg5[15], range),
762    stg5[16],
763    stg5[17],
764    half_btf(-COSPI_INV[16], stg5[18], COSPI_INV[48], stg5[29], INV_COS_BIT),
765    half_btf(-COSPI_INV[16], stg5[19], COSPI_INV[48], stg5[28], INV_COS_BIT),
766    half_btf(-COSPI_INV[48], stg5[20], -COSPI_INV[16], stg5[27], INV_COS_BIT),
767    half_btf(-COSPI_INV[48], stg5[21], -COSPI_INV[16], stg5[26], INV_COS_BIT),
768    stg5[22],
769    stg5[23],
770    stg5[24],
771    stg5[25],
772    half_btf(-COSPI_INV[16], stg5[21], COSPI_INV[48], stg5[26], INV_COS_BIT),
773    half_btf(-COSPI_INV[16], stg5[20], COSPI_INV[48], stg5[27], INV_COS_BIT),
774    half_btf(COSPI_INV[48], stg5[19], COSPI_INV[16], stg5[28], INV_COS_BIT),
775    half_btf(COSPI_INV[48], stg5[18], COSPI_INV[16], stg5[29], INV_COS_BIT),
776    stg5[30],
777    stg5[31],
778  ];
779
780  // stage 7
781  let stg7 = [
782    clamp_value(stg6[0] + stg6[7], range),
783    clamp_value(stg6[1] + stg6[6], range),
784    clamp_value(stg6[2] + stg6[5], range),
785    clamp_value(stg6[3] + stg6[4], range),
786    clamp_value(stg6[3] - stg6[4], range),
787    clamp_value(stg6[2] - stg6[5], range),
788    clamp_value(stg6[1] - stg6[6], range),
789    clamp_value(stg6[0] - stg6[7], range),
790    stg6[8],
791    stg6[9],
792    half_btf(-COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
793    half_btf(-COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
794    half_btf(COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
795    half_btf(COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
796    stg6[14],
797    stg6[15],
798    clamp_value(stg6[16] + stg6[23], range),
799    clamp_value(stg6[17] + stg6[22], range),
800    clamp_value(stg6[18] + stg6[21], range),
801    clamp_value(stg6[19] + stg6[20], range),
802    clamp_value(stg6[19] - stg6[20], range),
803    clamp_value(stg6[18] - stg6[21], range),
804    clamp_value(stg6[17] - stg6[22], range),
805    clamp_value(stg6[16] - stg6[23], range),
806    clamp_value(-stg6[24] + stg6[31], range),
807    clamp_value(-stg6[25] + stg6[30], range),
808    clamp_value(-stg6[26] + stg6[29], range),
809    clamp_value(-stg6[27] + stg6[28], range),
810    clamp_value(stg6[27] + stg6[28], range),
811    clamp_value(stg6[26] + stg6[29], range),
812    clamp_value(stg6[25] + stg6[30], range),
813    clamp_value(stg6[24] + stg6[31], range),
814  ];
815
816  // stage 8
817  let stg8 = [
818    clamp_value(stg7[0] + stg7[15], range),
819    clamp_value(stg7[1] + stg7[14], range),
820    clamp_value(stg7[2] + stg7[13], range),
821    clamp_value(stg7[3] + stg7[12], range),
822    clamp_value(stg7[4] + stg7[11], range),
823    clamp_value(stg7[5] + stg7[10], range),
824    clamp_value(stg7[6] + stg7[9], range),
825    clamp_value(stg7[7] + stg7[8], range),
826    clamp_value(stg7[7] - stg7[8], range),
827    clamp_value(stg7[6] - stg7[9], range),
828    clamp_value(stg7[5] - stg7[10], range),
829    clamp_value(stg7[4] - stg7[11], range),
830    clamp_value(stg7[3] - stg7[12], range),
831    clamp_value(stg7[2] - stg7[13], range),
832    clamp_value(stg7[1] - stg7[14], range),
833    clamp_value(stg7[0] - stg7[15], range),
834    stg7[16],
835    stg7[17],
836    stg7[18],
837    stg7[19],
838    half_btf(-COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
839    half_btf(-COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
840    half_btf(-COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
841    half_btf(-COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
842    half_btf(COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
843    half_btf(COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
844    half_btf(COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
845    half_btf(COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
846    stg7[28],
847    stg7[29],
848    stg7[30],
849    stg7[31],
850  ];
851
852  // stage 9
853  output[0] = clamp_value(stg8[0] + stg8[31], range);
854  output[1] = clamp_value(stg8[1] + stg8[30], range);
855  output[2] = clamp_value(stg8[2] + stg8[29], range);
856  output[3] = clamp_value(stg8[3] + stg8[28], range);
857  output[4] = clamp_value(stg8[4] + stg8[27], range);
858  output[5] = clamp_value(stg8[5] + stg8[26], range);
859  output[6] = clamp_value(stg8[6] + stg8[25], range);
860  output[7] = clamp_value(stg8[7] + stg8[24], range);
861  output[8] = clamp_value(stg8[8] + stg8[23], range);
862  output[9] = clamp_value(stg8[9] + stg8[22], range);
863  output[10] = clamp_value(stg8[10] + stg8[21], range);
864  output[11] = clamp_value(stg8[11] + stg8[20], range);
865  output[12] = clamp_value(stg8[12] + stg8[19], range);
866  output[13] = clamp_value(stg8[13] + stg8[18], range);
867  output[14] = clamp_value(stg8[14] + stg8[17], range);
868  output[15] = clamp_value(stg8[15] + stg8[16], range);
869  output[16] = clamp_value(stg8[15] - stg8[16], range);
870  output[17] = clamp_value(stg8[14] - stg8[17], range);
871  output[18] = clamp_value(stg8[13] - stg8[18], range);
872  output[19] = clamp_value(stg8[12] - stg8[19], range);
873  output[20] = clamp_value(stg8[11] - stg8[20], range);
874  output[21] = clamp_value(stg8[10] - stg8[21], range);
875  output[22] = clamp_value(stg8[9] - stg8[22], range);
876  output[23] = clamp_value(stg8[8] - stg8[23], range);
877  output[24] = clamp_value(stg8[7] - stg8[24], range);
878  output[25] = clamp_value(stg8[6] - stg8[25], range);
879  output[26] = clamp_value(stg8[5] - stg8[26], range);
880  output[27] = clamp_value(stg8[4] - stg8[27], range);
881  output[28] = clamp_value(stg8[3] - stg8[28], range);
882  output[29] = clamp_value(stg8[2] - stg8[29], range);
883  output[30] = clamp_value(stg8[1] - stg8[30], range);
884  output[31] = clamp_value(stg8[0] - stg8[31], range);
885}
886
/// Inverse 32-point identity transform: scales each of the first 32
/// coefficients by 4 (the spec's approximation of `4 * sqrt(2) / sqrt(2)`
/// rounding chain collapses to a plain multiply for this size).
///
/// `_range` is accepted for signature parity with the other 1-D inverse
/// transforms but is unused — no intermediate clamping is needed.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 32 items.
fn av1_iidentity32(input: &[i32], output: &mut [i32], _range: usize) {
  assert!(input.len() >= 32);
  assert!(output.len() >= 32);

  output[..32]
    .iter_mut()
    .zip(input[..32].iter())
    .for_each(|(outp, inp)| *outp = 4 * *inp);
}
893
894fn av1_idct64(input: &[i32], output: &mut [i32], range: usize) {
895  assert!(input.len() >= 64);
896  assert!(output.len() >= 64);
897
898  // stage 1;
899  let stg1 = [
900    input[0], input[32], input[16], input[48], input[8], input[40], input[24],
901    input[56], input[4], input[36], input[20], input[52], input[12],
902    input[44], input[28], input[60], input[2], input[34], input[18],
903    input[50], input[10], input[42], input[26], input[58], input[6],
904    input[38], input[22], input[54], input[14], input[46], input[30],
905    input[62], input[1], input[33], input[17], input[49], input[9], input[41],
906    input[25], input[57], input[5], input[37], input[21], input[53],
907    input[13], input[45], input[29], input[61], input[3], input[35],
908    input[19], input[51], input[11], input[43], input[27], input[59],
909    input[7], input[39], input[23], input[55], input[15], input[47],
910    input[31], input[63],
911  ];
912
913  // stage 2
914  let stg2 = [
915    stg1[0],
916    stg1[1],
917    stg1[2],
918    stg1[3],
919    stg1[4],
920    stg1[5],
921    stg1[6],
922    stg1[7],
923    stg1[8],
924    stg1[9],
925    stg1[10],
926    stg1[11],
927    stg1[12],
928    stg1[13],
929    stg1[14],
930    stg1[15],
931    stg1[16],
932    stg1[17],
933    stg1[18],
934    stg1[19],
935    stg1[20],
936    stg1[21],
937    stg1[22],
938    stg1[23],
939    stg1[24],
940    stg1[25],
941    stg1[26],
942    stg1[27],
943    stg1[28],
944    stg1[29],
945    stg1[30],
946    stg1[31],
947    half_btf(COSPI_INV[63], stg1[32], -COSPI_INV[1], stg1[63], INV_COS_BIT),
948    half_btf(COSPI_INV[31], stg1[33], -COSPI_INV[33], stg1[62], INV_COS_BIT),
949    half_btf(COSPI_INV[47], stg1[34], -COSPI_INV[17], stg1[61], INV_COS_BIT),
950    half_btf(COSPI_INV[15], stg1[35], -COSPI_INV[49], stg1[60], INV_COS_BIT),
951    half_btf(COSPI_INV[55], stg1[36], -COSPI_INV[9], stg1[59], INV_COS_BIT),
952    half_btf(COSPI_INV[23], stg1[37], -COSPI_INV[41], stg1[58], INV_COS_BIT),
953    half_btf(COSPI_INV[39], stg1[38], -COSPI_INV[25], stg1[57], INV_COS_BIT),
954    half_btf(COSPI_INV[7], stg1[39], -COSPI_INV[57], stg1[56], INV_COS_BIT),
955    half_btf(COSPI_INV[59], stg1[40], -COSPI_INV[5], stg1[55], INV_COS_BIT),
956    half_btf(COSPI_INV[27], stg1[41], -COSPI_INV[37], stg1[54], INV_COS_BIT),
957    half_btf(COSPI_INV[43], stg1[42], -COSPI_INV[21], stg1[53], INV_COS_BIT),
958    half_btf(COSPI_INV[11], stg1[43], -COSPI_INV[53], stg1[52], INV_COS_BIT),
959    half_btf(COSPI_INV[51], stg1[44], -COSPI_INV[13], stg1[51], INV_COS_BIT),
960    half_btf(COSPI_INV[19], stg1[45], -COSPI_INV[45], stg1[50], INV_COS_BIT),
961    half_btf(COSPI_INV[35], stg1[46], -COSPI_INV[29], stg1[49], INV_COS_BIT),
962    half_btf(COSPI_INV[3], stg1[47], -COSPI_INV[61], stg1[48], INV_COS_BIT),
963    half_btf(COSPI_INV[61], stg1[47], COSPI_INV[3], stg1[48], INV_COS_BIT),
964    half_btf(COSPI_INV[29], stg1[46], COSPI_INV[35], stg1[49], INV_COS_BIT),
965    half_btf(COSPI_INV[45], stg1[45], COSPI_INV[19], stg1[50], INV_COS_BIT),
966    half_btf(COSPI_INV[13], stg1[44], COSPI_INV[51], stg1[51], INV_COS_BIT),
967    half_btf(COSPI_INV[53], stg1[43], COSPI_INV[11], stg1[52], INV_COS_BIT),
968    half_btf(COSPI_INV[21], stg1[42], COSPI_INV[43], stg1[53], INV_COS_BIT),
969    half_btf(COSPI_INV[37], stg1[41], COSPI_INV[27], stg1[54], INV_COS_BIT),
970    half_btf(COSPI_INV[5], stg1[40], COSPI_INV[59], stg1[55], INV_COS_BIT),
971    half_btf(COSPI_INV[57], stg1[39], COSPI_INV[7], stg1[56], INV_COS_BIT),
972    half_btf(COSPI_INV[25], stg1[38], COSPI_INV[39], stg1[57], INV_COS_BIT),
973    half_btf(COSPI_INV[41], stg1[37], COSPI_INV[23], stg1[58], INV_COS_BIT),
974    half_btf(COSPI_INV[9], stg1[36], COSPI_INV[55], stg1[59], INV_COS_BIT),
975    half_btf(COSPI_INV[49], stg1[35], COSPI_INV[15], stg1[60], INV_COS_BIT),
976    half_btf(COSPI_INV[17], stg1[34], COSPI_INV[47], stg1[61], INV_COS_BIT),
977    half_btf(COSPI_INV[33], stg1[33], COSPI_INV[31], stg1[62], INV_COS_BIT),
978    half_btf(COSPI_INV[1], stg1[32], COSPI_INV[63], stg1[63], INV_COS_BIT),
979  ];
980
981  // stage 3
982  let stg3 = [
983    stg2[0],
984    stg2[1],
985    stg2[2],
986    stg2[3],
987    stg2[4],
988    stg2[5],
989    stg2[6],
990    stg2[7],
991    stg2[8],
992    stg2[9],
993    stg2[10],
994    stg2[11],
995    stg2[12],
996    stg2[13],
997    stg2[14],
998    stg2[15],
999    half_btf(COSPI_INV[62], stg2[16], -COSPI_INV[2], stg2[31], INV_COS_BIT),
1000    half_btf(COSPI_INV[30], stg2[17], -COSPI_INV[34], stg2[30], INV_COS_BIT),
1001    half_btf(COSPI_INV[46], stg2[18], -COSPI_INV[18], stg2[29], INV_COS_BIT),
1002    half_btf(COSPI_INV[14], stg2[19], -COSPI_INV[50], stg2[28], INV_COS_BIT),
1003    half_btf(COSPI_INV[54], stg2[20], -COSPI_INV[10], stg2[27], INV_COS_BIT),
1004    half_btf(COSPI_INV[22], stg2[21], -COSPI_INV[42], stg2[26], INV_COS_BIT),
1005    half_btf(COSPI_INV[38], stg2[22], -COSPI_INV[26], stg2[25], INV_COS_BIT),
1006    half_btf(COSPI_INV[6], stg2[23], -COSPI_INV[58], stg2[24], INV_COS_BIT),
1007    half_btf(COSPI_INV[58], stg2[23], COSPI_INV[6], stg2[24], INV_COS_BIT),
1008    half_btf(COSPI_INV[26], stg2[22], COSPI_INV[38], stg2[25], INV_COS_BIT),
1009    half_btf(COSPI_INV[42], stg2[21], COSPI_INV[22], stg2[26], INV_COS_BIT),
1010    half_btf(COSPI_INV[10], stg2[20], COSPI_INV[54], stg2[27], INV_COS_BIT),
1011    half_btf(COSPI_INV[50], stg2[19], COSPI_INV[14], stg2[28], INV_COS_BIT),
1012    half_btf(COSPI_INV[18], stg2[18], COSPI_INV[46], stg2[29], INV_COS_BIT),
1013    half_btf(COSPI_INV[34], stg2[17], COSPI_INV[30], stg2[30], INV_COS_BIT),
1014    half_btf(COSPI_INV[2], stg2[16], COSPI_INV[62], stg2[31], INV_COS_BIT),
1015    clamp_value(stg2[32] + stg2[33], range),
1016    clamp_value(stg2[32] - stg2[33], range),
1017    clamp_value(-stg2[34] + stg2[35], range),
1018    clamp_value(stg2[34] + stg2[35], range),
1019    clamp_value(stg2[36] + stg2[37], range),
1020    clamp_value(stg2[36] - stg2[37], range),
1021    clamp_value(-stg2[38] + stg2[39], range),
1022    clamp_value(stg2[38] + stg2[39], range),
1023    clamp_value(stg2[40] + stg2[41], range),
1024    clamp_value(stg2[40] - stg2[41], range),
1025    clamp_value(-stg2[42] + stg2[43], range),
1026    clamp_value(stg2[42] + stg2[43], range),
1027    clamp_value(stg2[44] + stg2[45], range),
1028    clamp_value(stg2[44] - stg2[45], range),
1029    clamp_value(-stg2[46] + stg2[47], range),
1030    clamp_value(stg2[46] + stg2[47], range),
1031    clamp_value(stg2[48] + stg2[49], range),
1032    clamp_value(stg2[48] - stg2[49], range),
1033    clamp_value(-stg2[50] + stg2[51], range),
1034    clamp_value(stg2[50] + stg2[51], range),
1035    clamp_value(stg2[52] + stg2[53], range),
1036    clamp_value(stg2[52] - stg2[53], range),
1037    clamp_value(-stg2[54] + stg2[55], range),
1038    clamp_value(stg2[54] + stg2[55], range),
1039    clamp_value(stg2[56] + stg2[57], range),
1040    clamp_value(stg2[56] - stg2[57], range),
1041    clamp_value(-stg2[58] + stg2[59], range),
1042    clamp_value(stg2[58] + stg2[59], range),
1043    clamp_value(stg2[60] + stg2[61], range),
1044    clamp_value(stg2[60] - stg2[61], range),
1045    clamp_value(-stg2[62] + stg2[63], range),
1046    clamp_value(stg2[62] + stg2[63], range),
1047  ];
1048
1049  // stage 4
1050  let stg4 = [
1051    stg3[0],
1052    stg3[1],
1053    stg3[2],
1054    stg3[3],
1055    stg3[4],
1056    stg3[5],
1057    stg3[6],
1058    stg3[7],
1059    half_btf(COSPI_INV[60], stg3[8], -COSPI_INV[4], stg3[15], INV_COS_BIT),
1060    half_btf(COSPI_INV[28], stg3[9], -COSPI_INV[36], stg3[14], INV_COS_BIT),
1061    half_btf(COSPI_INV[44], stg3[10], -COSPI_INV[20], stg3[13], INV_COS_BIT),
1062    half_btf(COSPI_INV[12], stg3[11], -COSPI_INV[52], stg3[12], INV_COS_BIT),
1063    half_btf(COSPI_INV[52], stg3[11], COSPI_INV[12], stg3[12], INV_COS_BIT),
1064    half_btf(COSPI_INV[20], stg3[10], COSPI_INV[44], stg3[13], INV_COS_BIT),
1065    half_btf(COSPI_INV[36], stg3[9], COSPI_INV[28], stg3[14], INV_COS_BIT),
1066    half_btf(COSPI_INV[4], stg3[8], COSPI_INV[60], stg3[15], INV_COS_BIT),
1067    clamp_value(stg3[16] + stg3[17], range),
1068    clamp_value(stg3[16] - stg3[17], range),
1069    clamp_value(-stg3[18] + stg3[19], range),
1070    clamp_value(stg3[18] + stg3[19], range),
1071    clamp_value(stg3[20] + stg3[21], range),
1072    clamp_value(stg3[20] - stg3[21], range),
1073    clamp_value(-stg3[22] + stg3[23], range),
1074    clamp_value(stg3[22] + stg3[23], range),
1075    clamp_value(stg3[24] + stg3[25], range),
1076    clamp_value(stg3[24] - stg3[25], range),
1077    clamp_value(-stg3[26] + stg3[27], range),
1078    clamp_value(stg3[26] + stg3[27], range),
1079    clamp_value(stg3[28] + stg3[29], range),
1080    clamp_value(stg3[28] - stg3[29], range),
1081    clamp_value(-stg3[30] + stg3[31], range),
1082    clamp_value(stg3[30] + stg3[31], range),
1083    stg3[32],
1084    half_btf(-COSPI_INV[4], stg3[33], COSPI_INV[60], stg3[62], INV_COS_BIT),
1085    half_btf(-COSPI_INV[60], stg3[34], -COSPI_INV[4], stg3[61], INV_COS_BIT),
1086    stg3[35],
1087    stg3[36],
1088    half_btf(-COSPI_INV[36], stg3[37], COSPI_INV[28], stg3[58], INV_COS_BIT),
1089    half_btf(-COSPI_INV[28], stg3[38], -COSPI_INV[36], stg3[57], INV_COS_BIT),
1090    stg3[39],
1091    stg3[40],
1092    half_btf(-COSPI_INV[20], stg3[41], COSPI_INV[44], stg3[54], INV_COS_BIT),
1093    half_btf(-COSPI_INV[44], stg3[42], -COSPI_INV[20], stg3[53], INV_COS_BIT),
1094    stg3[43],
1095    stg3[44],
1096    half_btf(-COSPI_INV[52], stg3[45], COSPI_INV[12], stg3[50], INV_COS_BIT),
1097    half_btf(-COSPI_INV[12], stg3[46], -COSPI_INV[52], stg3[49], INV_COS_BIT),
1098    stg3[47],
1099    stg3[48],
1100    half_btf(-COSPI_INV[52], stg3[46], COSPI_INV[12], stg3[49], INV_COS_BIT),
1101    half_btf(COSPI_INV[12], stg3[45], COSPI_INV[52], stg3[50], INV_COS_BIT),
1102    stg3[51],
1103    stg3[52],
1104    half_btf(-COSPI_INV[20], stg3[42], COSPI_INV[44], stg3[53], INV_COS_BIT),
1105    half_btf(COSPI_INV[44], stg3[41], COSPI_INV[20], stg3[54], INV_COS_BIT),
1106    stg3[55],
1107    stg3[56],
1108    half_btf(-COSPI_INV[36], stg3[38], COSPI_INV[28], stg3[57], INV_COS_BIT),
1109    half_btf(COSPI_INV[28], stg3[37], COSPI_INV[36], stg3[58], INV_COS_BIT),
1110    stg3[59],
1111    stg3[60],
1112    half_btf(-COSPI_INV[4], stg3[34], COSPI_INV[60], stg3[61], INV_COS_BIT),
1113    half_btf(COSPI_INV[60], stg3[33], COSPI_INV[4], stg3[62], INV_COS_BIT),
1114    stg3[63],
1115  ];
1116
1117  // stage 5
1118  let stg5 = [
1119    stg4[0],
1120    stg4[1],
1121    stg4[2],
1122    stg4[3],
1123    half_btf(COSPI_INV[56], stg4[4], -COSPI_INV[8], stg4[7], INV_COS_BIT),
1124    half_btf(COSPI_INV[24], stg4[5], -COSPI_INV[40], stg4[6], INV_COS_BIT),
1125    half_btf(COSPI_INV[40], stg4[5], COSPI_INV[24], stg4[6], INV_COS_BIT),
1126    half_btf(COSPI_INV[8], stg4[4], COSPI_INV[56], stg4[7], INV_COS_BIT),
1127    clamp_value(stg4[8] + stg4[9], range),
1128    clamp_value(stg4[8] - stg4[9], range),
1129    clamp_value(-stg4[10] + stg4[11], range),
1130    clamp_value(stg4[10] + stg4[11], range),
1131    clamp_value(stg4[12] + stg4[13], range),
1132    clamp_value(stg4[12] - stg4[13], range),
1133    clamp_value(-stg4[14] + stg4[15], range),
1134    clamp_value(stg4[14] + stg4[15], range),
1135    stg4[16],
1136    half_btf(-COSPI_INV[8], stg4[17], COSPI_INV[56], stg4[30], INV_COS_BIT),
1137    half_btf(-COSPI_INV[56], stg4[18], -COSPI_INV[8], stg4[29], INV_COS_BIT),
1138    stg4[19],
1139    stg4[20],
1140    half_btf(-COSPI_INV[40], stg4[21], COSPI_INV[24], stg4[26], INV_COS_BIT),
1141    half_btf(-COSPI_INV[24], stg4[22], -COSPI_INV[40], stg4[25], INV_COS_BIT),
1142    stg4[23],
1143    stg4[24],
1144    half_btf(-COSPI_INV[40], stg4[22], COSPI_INV[24], stg4[25], INV_COS_BIT),
1145    half_btf(COSPI_INV[24], stg4[21], COSPI_INV[40], stg4[26], INV_COS_BIT),
1146    stg4[27],
1147    stg4[28],
1148    half_btf(-COSPI_INV[8], stg4[18], COSPI_INV[56], stg4[29], INV_COS_BIT),
1149    half_btf(COSPI_INV[56], stg4[17], COSPI_INV[8], stg4[30], INV_COS_BIT),
1150    stg4[31],
1151    clamp_value(stg4[32] + stg4[35], range),
1152    clamp_value(stg4[33] + stg4[34], range),
1153    clamp_value(stg4[33] - stg4[34], range),
1154    clamp_value(stg4[32] - stg4[35], range),
1155    clamp_value(-stg4[36] + stg4[39], range),
1156    clamp_value(-stg4[37] + stg4[38], range),
1157    clamp_value(stg4[37] + stg4[38], range),
1158    clamp_value(stg4[36] + stg4[39], range),
1159    clamp_value(stg4[40] + stg4[43], range),
1160    clamp_value(stg4[41] + stg4[42], range),
1161    clamp_value(stg4[41] - stg4[42], range),
1162    clamp_value(stg4[40] - stg4[43], range),
1163    clamp_value(-stg4[44] + stg4[47], range),
1164    clamp_value(-stg4[45] + stg4[46], range),
1165    clamp_value(stg4[45] + stg4[46], range),
1166    clamp_value(stg4[44] + stg4[47], range),
1167    clamp_value(stg4[48] + stg4[51], range),
1168    clamp_value(stg4[49] + stg4[50], range),
1169    clamp_value(stg4[49] - stg4[50], range),
1170    clamp_value(stg4[48] - stg4[51], range),
1171    clamp_value(-stg4[52] + stg4[55], range),
1172    clamp_value(-stg4[53] + stg4[54], range),
1173    clamp_value(stg4[53] + stg4[54], range),
1174    clamp_value(stg4[52] + stg4[55], range),
1175    clamp_value(stg4[56] + stg4[59], range),
1176    clamp_value(stg4[57] + stg4[58], range),
1177    clamp_value(stg4[57] - stg4[58], range),
1178    clamp_value(stg4[56] - stg4[59], range),
1179    clamp_value(-stg4[60] + stg4[63], range),
1180    clamp_value(-stg4[61] + stg4[62], range),
1181    clamp_value(stg4[61] + stg4[62], range),
1182    clamp_value(stg4[60] + stg4[63], range),
1183  ];
1184
1185  // stage 6
1186  let stg6 = [
1187    half_btf(COSPI_INV[32], stg5[0], COSPI_INV[32], stg5[1], INV_COS_BIT),
1188    half_btf(COSPI_INV[32], stg5[0], -COSPI_INV[32], stg5[1], INV_COS_BIT),
1189    half_btf(COSPI_INV[48], stg5[2], -COSPI_INV[16], stg5[3], INV_COS_BIT),
1190    half_btf(COSPI_INV[16], stg5[2], COSPI_INV[48], stg5[3], INV_COS_BIT),
1191    clamp_value(stg5[4] + stg5[5], range),
1192    clamp_value(stg5[4] - stg5[5], range),
1193    clamp_value(-stg5[6] + stg5[7], range),
1194    clamp_value(stg5[6] + stg5[7], range),
1195    stg5[8],
1196    half_btf(-COSPI_INV[16], stg5[9], COSPI_INV[48], stg5[14], INV_COS_BIT),
1197    half_btf(-COSPI_INV[48], stg5[10], -COSPI_INV[16], stg5[13], INV_COS_BIT),
1198    stg5[11],
1199    stg5[12],
1200    half_btf(-COSPI_INV[16], stg5[10], COSPI_INV[48], stg5[13], INV_COS_BIT),
1201    half_btf(COSPI_INV[48], stg5[9], COSPI_INV[16], stg5[14], INV_COS_BIT),
1202    stg5[15],
1203    clamp_value(stg5[16] + stg5[19], range),
1204    clamp_value(stg5[17] + stg5[18], range),
1205    clamp_value(stg5[17] - stg5[18], range),
1206    clamp_value(stg5[16] - stg5[19], range),
1207    clamp_value(-stg5[20] + stg5[23], range),
1208    clamp_value(-stg5[21] + stg5[22], range),
1209    clamp_value(stg5[21] + stg5[22], range),
1210    clamp_value(stg5[20] + stg5[23], range),
1211    clamp_value(stg5[24] + stg5[27], range),
1212    clamp_value(stg5[25] + stg5[26], range),
1213    clamp_value(stg5[25] - stg5[26], range),
1214    clamp_value(stg5[24] - stg5[27], range),
1215    clamp_value(-stg5[28] + stg5[31], range),
1216    clamp_value(-stg5[29] + stg5[30], range),
1217    clamp_value(stg5[29] + stg5[30], range),
1218    clamp_value(stg5[28] + stg5[31], range),
1219    stg5[32],
1220    stg5[33],
1221    half_btf(-COSPI_INV[8], stg5[34], COSPI_INV[56], stg5[61], INV_COS_BIT),
1222    half_btf(-COSPI_INV[8], stg5[35], COSPI_INV[56], stg5[60], INV_COS_BIT),
1223    half_btf(-COSPI_INV[56], stg5[36], -COSPI_INV[8], stg5[59], INV_COS_BIT),
1224    half_btf(-COSPI_INV[56], stg5[37], -COSPI_INV[8], stg5[58], INV_COS_BIT),
1225    stg5[38],
1226    stg5[39],
1227    stg5[40],
1228    stg5[41],
1229    half_btf(-COSPI_INV[40], stg5[42], COSPI_INV[24], stg5[53], INV_COS_BIT),
1230    half_btf(-COSPI_INV[40], stg5[43], COSPI_INV[24], stg5[52], INV_COS_BIT),
1231    half_btf(-COSPI_INV[24], stg5[44], -COSPI_INV[40], stg5[51], INV_COS_BIT),
1232    half_btf(-COSPI_INV[24], stg5[45], -COSPI_INV[40], stg5[50], INV_COS_BIT),
1233    stg5[46],
1234    stg5[47],
1235    stg5[48],
1236    stg5[49],
1237    half_btf(-COSPI_INV[40], stg5[45], COSPI_INV[24], stg5[50], INV_COS_BIT),
1238    half_btf(-COSPI_INV[40], stg5[44], COSPI_INV[24], stg5[51], INV_COS_BIT),
1239    half_btf(COSPI_INV[24], stg5[43], COSPI_INV[40], stg5[52], INV_COS_BIT),
1240    half_btf(COSPI_INV[24], stg5[42], COSPI_INV[40], stg5[53], INV_COS_BIT),
1241    stg5[54],
1242    stg5[55],
1243    stg5[56],
1244    stg5[57],
1245    half_btf(-COSPI_INV[8], stg5[37], COSPI_INV[56], stg5[58], INV_COS_BIT),
1246    half_btf(-COSPI_INV[8], stg5[36], COSPI_INV[56], stg5[59], INV_COS_BIT),
1247    half_btf(COSPI_INV[56], stg5[35], COSPI_INV[8], stg5[60], INV_COS_BIT),
1248    half_btf(COSPI_INV[56], stg5[34], COSPI_INV[8], stg5[61], INV_COS_BIT),
1249    stg5[62],
1250    stg5[63],
1251  ];
1252
1253  // stage 7
1254  let stg7 = [
1255    clamp_value(stg6[0] + stg6[3], range),
1256    clamp_value(stg6[1] + stg6[2], range),
1257    clamp_value(stg6[1] - stg6[2], range),
1258    clamp_value(stg6[0] - stg6[3], range),
1259    stg6[4],
1260    half_btf(-COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
1261    half_btf(COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
1262    stg6[7],
1263    clamp_value(stg6[8] + stg6[11], range),
1264    clamp_value(stg6[9] + stg6[10], range),
1265    clamp_value(stg6[9] - stg6[10], range),
1266    clamp_value(stg6[8] - stg6[11], range),
1267    clamp_value(-stg6[12] + stg6[15], range),
1268    clamp_value(-stg6[13] + stg6[14], range),
1269    clamp_value(stg6[13] + stg6[14], range),
1270    clamp_value(stg6[12] + stg6[15], range),
1271    stg6[16],
1272    stg6[17],
1273    half_btf(-COSPI_INV[16], stg6[18], COSPI_INV[48], stg6[29], INV_COS_BIT),
1274    half_btf(-COSPI_INV[16], stg6[19], COSPI_INV[48], stg6[28], INV_COS_BIT),
1275    half_btf(-COSPI_INV[48], stg6[20], -COSPI_INV[16], stg6[27], INV_COS_BIT),
1276    half_btf(-COSPI_INV[48], stg6[21], -COSPI_INV[16], stg6[26], INV_COS_BIT),
1277    stg6[22],
1278    stg6[23],
1279    stg6[24],
1280    stg6[25],
1281    half_btf(-COSPI_INV[16], stg6[21], COSPI_INV[48], stg6[26], INV_COS_BIT),
1282    half_btf(-COSPI_INV[16], stg6[20], COSPI_INV[48], stg6[27], INV_COS_BIT),
1283    half_btf(COSPI_INV[48], stg6[19], COSPI_INV[16], stg6[28], INV_COS_BIT),
1284    half_btf(COSPI_INV[48], stg6[18], COSPI_INV[16], stg6[29], INV_COS_BIT),
1285    stg6[30],
1286    stg6[31],
1287    clamp_value(stg6[32] + stg6[39], range),
1288    clamp_value(stg6[33] + stg6[38], range),
1289    clamp_value(stg6[34] + stg6[37], range),
1290    clamp_value(stg6[35] + stg6[36], range),
1291    clamp_value(stg6[35] - stg6[36], range),
1292    clamp_value(stg6[34] - stg6[37], range),
1293    clamp_value(stg6[33] - stg6[38], range),
1294    clamp_value(stg6[32] - stg6[39], range),
1295    clamp_value(-stg6[40] + stg6[47], range),
1296    clamp_value(-stg6[41] + stg6[46], range),
1297    clamp_value(-stg6[42] + stg6[45], range),
1298    clamp_value(-stg6[43] + stg6[44], range),
1299    clamp_value(stg6[43] + stg6[44], range),
1300    clamp_value(stg6[42] + stg6[45], range),
1301    clamp_value(stg6[41] + stg6[46], range),
1302    clamp_value(stg6[40] + stg6[47], range),
1303    clamp_value(stg6[48] + stg6[55], range),
1304    clamp_value(stg6[49] + stg6[54], range),
1305    clamp_value(stg6[50] + stg6[53], range),
1306    clamp_value(stg6[51] + stg6[52], range),
1307    clamp_value(stg6[51] - stg6[52], range),
1308    clamp_value(stg6[50] - stg6[53], range),
1309    clamp_value(stg6[49] - stg6[54], range),
1310    clamp_value(stg6[48] - stg6[55], range),
1311    clamp_value(-stg6[56] + stg6[63], range),
1312    clamp_value(-stg6[57] + stg6[62], range),
1313    clamp_value(-stg6[58] + stg6[61], range),
1314    clamp_value(-stg6[59] + stg6[60], range),
1315    clamp_value(stg6[59] + stg6[60], range),
1316    clamp_value(stg6[58] + stg6[61], range),
1317    clamp_value(stg6[57] + stg6[62], range),
1318    clamp_value(stg6[56] + stg6[63], range),
1319  ];
1320
1321  // stage 8
1322  let stg8 = [
1323    clamp_value(stg7[0] + stg7[7], range),
1324    clamp_value(stg7[1] + stg7[6], range),
1325    clamp_value(stg7[2] + stg7[5], range),
1326    clamp_value(stg7[3] + stg7[4], range),
1327    clamp_value(stg7[3] - stg7[4], range),
1328    clamp_value(stg7[2] - stg7[5], range),
1329    clamp_value(stg7[1] - stg7[6], range),
1330    clamp_value(stg7[0] - stg7[7], range),
1331    stg7[8],
1332    stg7[9],
1333    half_btf(-COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
1334    half_btf(-COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
1335    half_btf(COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
1336    half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
1337    stg7[14],
1338    stg7[15],
1339    clamp_value(stg7[16] + stg7[23], range),
1340    clamp_value(stg7[17] + stg7[22], range),
1341    clamp_value(stg7[18] + stg7[21], range),
1342    clamp_value(stg7[19] + stg7[20], range),
1343    clamp_value(stg7[19] - stg7[20], range),
1344    clamp_value(stg7[18] - stg7[21], range),
1345    clamp_value(stg7[17] - stg7[22], range),
1346    clamp_value(stg7[16] - stg7[23], range),
1347    clamp_value(-stg7[24] + stg7[31], range),
1348    clamp_value(-stg7[25] + stg7[30], range),
1349    clamp_value(-stg7[26] + stg7[29], range),
1350    clamp_value(-stg7[27] + stg7[28], range),
1351    clamp_value(stg7[27] + stg7[28], range),
1352    clamp_value(stg7[26] + stg7[29], range),
1353    clamp_value(stg7[25] + stg7[30], range),
1354    clamp_value(stg7[24] + stg7[31], range),
1355    stg7[32],
1356    stg7[33],
1357    stg7[34],
1358    stg7[35],
1359    half_btf(-COSPI_INV[16], stg7[36], COSPI_INV[48], stg7[59], INV_COS_BIT),
1360    half_btf(-COSPI_INV[16], stg7[37], COSPI_INV[48], stg7[58], INV_COS_BIT),
1361    half_btf(-COSPI_INV[16], stg7[38], COSPI_INV[48], stg7[57], INV_COS_BIT),
1362    half_btf(-COSPI_INV[16], stg7[39], COSPI_INV[48], stg7[56], INV_COS_BIT),
1363    half_btf(-COSPI_INV[48], stg7[40], -COSPI_INV[16], stg7[55], INV_COS_BIT),
1364    half_btf(-COSPI_INV[48], stg7[41], -COSPI_INV[16], stg7[54], INV_COS_BIT),
1365    half_btf(-COSPI_INV[48], stg7[42], -COSPI_INV[16], stg7[53], INV_COS_BIT),
1366    half_btf(-COSPI_INV[48], stg7[43], -COSPI_INV[16], stg7[52], INV_COS_BIT),
1367    stg7[44],
1368    stg7[45],
1369    stg7[46],
1370    stg7[47],
1371    stg7[48],
1372    stg7[49],
1373    stg7[50],
1374    stg7[51],
1375    half_btf(-COSPI_INV[16], stg7[43], COSPI_INV[48], stg7[52], INV_COS_BIT),
1376    half_btf(-COSPI_INV[16], stg7[42], COSPI_INV[48], stg7[53], INV_COS_BIT),
1377    half_btf(-COSPI_INV[16], stg7[41], COSPI_INV[48], stg7[54], INV_COS_BIT),
1378    half_btf(-COSPI_INV[16], stg7[40], COSPI_INV[48], stg7[55], INV_COS_BIT),
1379    half_btf(COSPI_INV[48], stg7[39], COSPI_INV[16], stg7[56], INV_COS_BIT),
1380    half_btf(COSPI_INV[48], stg7[38], COSPI_INV[16], stg7[57], INV_COS_BIT),
1381    half_btf(COSPI_INV[48], stg7[37], COSPI_INV[16], stg7[58], INV_COS_BIT),
1382    half_btf(COSPI_INV[48], stg7[36], COSPI_INV[16], stg7[59], INV_COS_BIT),
1383    stg7[60],
1384    stg7[61],
1385    stg7[62],
1386    stg7[63],
1387  ];
1388
1389  // stage 9
1390  let stg9 = [
1391    clamp_value(stg8[0] + stg8[15], range),
1392    clamp_value(stg8[1] + stg8[14], range),
1393    clamp_value(stg8[2] + stg8[13], range),
1394    clamp_value(stg8[3] + stg8[12], range),
1395    clamp_value(stg8[4] + stg8[11], range),
1396    clamp_value(stg8[5] + stg8[10], range),
1397    clamp_value(stg8[6] + stg8[9], range),
1398    clamp_value(stg8[7] + stg8[8], range),
1399    clamp_value(stg8[7] - stg8[8], range),
1400    clamp_value(stg8[6] - stg8[9], range),
1401    clamp_value(stg8[5] - stg8[10], range),
1402    clamp_value(stg8[4] - stg8[11], range),
1403    clamp_value(stg8[3] - stg8[12], range),
1404    clamp_value(stg8[2] - stg8[13], range),
1405    clamp_value(stg8[1] - stg8[14], range),
1406    clamp_value(stg8[0] - stg8[15], range),
1407    stg8[16],
1408    stg8[17],
1409    stg8[18],
1410    stg8[19],
1411    half_btf(-COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
1412    half_btf(-COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
1413    half_btf(-COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
1414    half_btf(-COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
1415    half_btf(COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
1416    half_btf(COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
1417    half_btf(COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
1418    half_btf(COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
1419    stg8[28],
1420    stg8[29],
1421    stg8[30],
1422    stg8[31],
1423    clamp_value(stg8[32] + stg8[47], range),
1424    clamp_value(stg8[33] + stg8[46], range),
1425    clamp_value(stg8[34] + stg8[45], range),
1426    clamp_value(stg8[35] + stg8[44], range),
1427    clamp_value(stg8[36] + stg8[43], range),
1428    clamp_value(stg8[37] + stg8[42], range),
1429    clamp_value(stg8[38] + stg8[41], range),
1430    clamp_value(stg8[39] + stg8[40], range),
1431    clamp_value(stg8[39] - stg8[40], range),
1432    clamp_value(stg8[38] - stg8[41], range),
1433    clamp_value(stg8[37] - stg8[42], range),
1434    clamp_value(stg8[36] - stg8[43], range),
1435    clamp_value(stg8[35] - stg8[44], range),
1436    clamp_value(stg8[34] - stg8[45], range),
1437    clamp_value(stg8[33] - stg8[46], range),
1438    clamp_value(stg8[32] - stg8[47], range),
1439    clamp_value(-stg8[48] + stg8[63], range),
1440    clamp_value(-stg8[49] + stg8[62], range),
1441    clamp_value(-stg8[50] + stg8[61], range),
1442    clamp_value(-stg8[51] + stg8[60], range),
1443    clamp_value(-stg8[52] + stg8[59], range),
1444    clamp_value(-stg8[53] + stg8[58], range),
1445    clamp_value(-stg8[54] + stg8[57], range),
1446    clamp_value(-stg8[55] + stg8[56], range),
1447    clamp_value(stg8[55] + stg8[56], range),
1448    clamp_value(stg8[54] + stg8[57], range),
1449    clamp_value(stg8[53] + stg8[58], range),
1450    clamp_value(stg8[52] + stg8[59], range),
1451    clamp_value(stg8[51] + stg8[60], range),
1452    clamp_value(stg8[50] + stg8[61], range),
1453    clamp_value(stg8[49] + stg8[62], range),
1454    clamp_value(stg8[48] + stg8[63], range),
1455  ];
1456
1457  // stage 10
1458  let stg10 = [
1459    clamp_value(stg9[0] + stg9[31], range),
1460    clamp_value(stg9[1] + stg9[30], range),
1461    clamp_value(stg9[2] + stg9[29], range),
1462    clamp_value(stg9[3] + stg9[28], range),
1463    clamp_value(stg9[4] + stg9[27], range),
1464    clamp_value(stg9[5] + stg9[26], range),
1465    clamp_value(stg9[6] + stg9[25], range),
1466    clamp_value(stg9[7] + stg9[24], range),
1467    clamp_value(stg9[8] + stg9[23], range),
1468    clamp_value(stg9[9] + stg9[22], range),
1469    clamp_value(stg9[10] + stg9[21], range),
1470    clamp_value(stg9[11] + stg9[20], range),
1471    clamp_value(stg9[12] + stg9[19], range),
1472    clamp_value(stg9[13] + stg9[18], range),
1473    clamp_value(stg9[14] + stg9[17], range),
1474    clamp_value(stg9[15] + stg9[16], range),
1475    clamp_value(stg9[15] - stg9[16], range),
1476    clamp_value(stg9[14] - stg9[17], range),
1477    clamp_value(stg9[13] - stg9[18], range),
1478    clamp_value(stg9[12] - stg9[19], range),
1479    clamp_value(stg9[11] - stg9[20], range),
1480    clamp_value(stg9[10] - stg9[21], range),
1481    clamp_value(stg9[9] - stg9[22], range),
1482    clamp_value(stg9[8] - stg9[23], range),
1483    clamp_value(stg9[7] - stg9[24], range),
1484    clamp_value(stg9[6] - stg9[25], range),
1485    clamp_value(stg9[5] - stg9[26], range),
1486    clamp_value(stg9[4] - stg9[27], range),
1487    clamp_value(stg9[3] - stg9[28], range),
1488    clamp_value(stg9[2] - stg9[29], range),
1489    clamp_value(stg9[1] - stg9[30], range),
1490    clamp_value(stg9[0] - stg9[31], range),
1491    stg9[32],
1492    stg9[33],
1493    stg9[34],
1494    stg9[35],
1495    stg9[36],
1496    stg9[37],
1497    stg9[38],
1498    stg9[39],
1499    half_btf(-COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
1500    half_btf(-COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
1501    half_btf(-COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
1502    half_btf(-COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
1503    half_btf(-COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
1504    half_btf(-COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
1505    half_btf(-COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
1506    half_btf(-COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
1507    half_btf(COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
1508    half_btf(COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
1509    half_btf(COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
1510    half_btf(COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
1511    half_btf(COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
1512    half_btf(COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
1513    half_btf(COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
1514    half_btf(COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
1515    stg9[56],
1516    stg9[57],
1517    stg9[58],
1518    stg9[59],
1519    stg9[60],
1520    stg9[61],
1521    stg9[62],
1522    stg9[63],
1523  ];
1524
1525  // stage 11
1526  output[0] = clamp_value(stg10[0] + stg10[63], range);
1527  output[1] = clamp_value(stg10[1] + stg10[62], range);
1528  output[2] = clamp_value(stg10[2] + stg10[61], range);
1529  output[3] = clamp_value(stg10[3] + stg10[60], range);
1530  output[4] = clamp_value(stg10[4] + stg10[59], range);
1531  output[5] = clamp_value(stg10[5] + stg10[58], range);
1532  output[6] = clamp_value(stg10[6] + stg10[57], range);
1533  output[7] = clamp_value(stg10[7] + stg10[56], range);
1534  output[8] = clamp_value(stg10[8] + stg10[55], range);
1535  output[9] = clamp_value(stg10[9] + stg10[54], range);
1536  output[10] = clamp_value(stg10[10] + stg10[53], range);
1537  output[11] = clamp_value(stg10[11] + stg10[52], range);
1538  output[12] = clamp_value(stg10[12] + stg10[51], range);
1539  output[13] = clamp_value(stg10[13] + stg10[50], range);
1540  output[14] = clamp_value(stg10[14] + stg10[49], range);
1541  output[15] = clamp_value(stg10[15] + stg10[48], range);
1542  output[16] = clamp_value(stg10[16] + stg10[47], range);
1543  output[17] = clamp_value(stg10[17] + stg10[46], range);
1544  output[18] = clamp_value(stg10[18] + stg10[45], range);
1545  output[19] = clamp_value(stg10[19] + stg10[44], range);
1546  output[20] = clamp_value(stg10[20] + stg10[43], range);
1547  output[21] = clamp_value(stg10[21] + stg10[42], range);
1548  output[22] = clamp_value(stg10[22] + stg10[41], range);
1549  output[23] = clamp_value(stg10[23] + stg10[40], range);
1550  output[24] = clamp_value(stg10[24] + stg10[39], range);
1551  output[25] = clamp_value(stg10[25] + stg10[38], range);
1552  output[26] = clamp_value(stg10[26] + stg10[37], range);
1553  output[27] = clamp_value(stg10[27] + stg10[36], range);
1554  output[28] = clamp_value(stg10[28] + stg10[35], range);
1555  output[29] = clamp_value(stg10[29] + stg10[34], range);
1556  output[30] = clamp_value(stg10[30] + stg10[33], range);
1557  output[31] = clamp_value(stg10[31] + stg10[32], range);
1558  output[32] = clamp_value(stg10[31] - stg10[32], range);
1559  output[33] = clamp_value(stg10[30] - stg10[33], range);
1560  output[34] = clamp_value(stg10[29] - stg10[34], range);
1561  output[35] = clamp_value(stg10[28] - stg10[35], range);
1562  output[36] = clamp_value(stg10[27] - stg10[36], range);
1563  output[37] = clamp_value(stg10[26] - stg10[37], range);
1564  output[38] = clamp_value(stg10[25] - stg10[38], range);
1565  output[39] = clamp_value(stg10[24] - stg10[39], range);
1566  output[40] = clamp_value(stg10[23] - stg10[40], range);
1567  output[41] = clamp_value(stg10[22] - stg10[41], range);
1568  output[42] = clamp_value(stg10[21] - stg10[42], range);
1569  output[43] = clamp_value(stg10[20] - stg10[43], range);
1570  output[44] = clamp_value(stg10[19] - stg10[44], range);
1571  output[45] = clamp_value(stg10[18] - stg10[45], range);
1572  output[46] = clamp_value(stg10[17] - stg10[46], range);
1573  output[47] = clamp_value(stg10[16] - stg10[47], range);
1574  output[48] = clamp_value(stg10[15] - stg10[48], range);
1575  output[49] = clamp_value(stg10[14] - stg10[49], range);
1576  output[50] = clamp_value(stg10[13] - stg10[50], range);
1577  output[51] = clamp_value(stg10[12] - stg10[51], range);
1578  output[52] = clamp_value(stg10[11] - stg10[52], range);
1579  output[53] = clamp_value(stg10[10] - stg10[53], range);
1580  output[54] = clamp_value(stg10[9] - stg10[54], range);
1581  output[55] = clamp_value(stg10[8] - stg10[55], range);
1582  output[56] = clamp_value(stg10[7] - stg10[56], range);
1583  output[57] = clamp_value(stg10[6] - stg10[57], range);
1584  output[58] = clamp_value(stg10[5] - stg10[58], range);
1585  output[59] = clamp_value(stg10[4] - stg10[59], range);
1586  output[60] = clamp_value(stg10[3] - stg10[60], range);
1587  output[61] = clamp_value(stg10[2] - stg10[61], range);
1588  output[62] = clamp_value(stg10[1] - stg10[62], range);
1589  output[63] = clamp_value(stg10[0] - stg10[63], range);
1590}
1591
/// Signature shared by every 1-D inverse transform implementation in this
/// file: reads coefficients from `input`, writes results to `output`, and
/// clamps intermediate values to `range` bits.
type InvTxfmFn = fn(input: &[i32], output: &mut [i32], range: usize);

/// 1-D inverse transform dispatch table.
///
/// Rows are indexed by the 1-D transform type value returned by
/// `get_1d_tx_types` (DCT, ADST, flipped ADST, identity, WHT — matching the
/// function names in each row); columns by transform length via
/// `ILog::ilog(size) - 3`, i.e. 4, 8, 16, 32, 64.
/// Combinations that cannot occur are `unimplemented!()` stubs.
static INV_TXFM_FNS: [[InvTxfmFn; 5]; 5] = [
  [av1_idct4, av1_idct8, av1_idct16, av1_idct32, av1_idct64],
  [
    av1_iadst4,
    av1_iadst8,
    av1_iadst16,
    // ADST is only defined for sizes up to 16 in AV1.
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
  [
    av1_iflipadst4,
    av1_iflipadst8,
    av1_iflipadst16,
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
  [
    av1_iidentity4,
    av1_iidentity8,
    av1_iidentity16,
    av1_iidentity32,
    |_, _, _| unimplemented!(),
  ],
  [
    // WHT is only used for 4x4 lossless blocks.
    av1_iwht4,
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
];
1625
pub(crate) mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::util::clamp;

  use simd_helpers::cold_for_target_arch;
  use std::cmp;

  /// Performs the 2-D inverse transform for `tx_size`/`tx_type` on `input`
  /// and adds the rounded result to the pixels already in `output`,
  /// clamping to `[0, 2^bd - 1]`.
  ///
  /// Follows the AV1 spec's 2-D inverse transform process: a 1-D inverse
  /// transform over every row, an intermediate rescale, then a 1-D inverse
  /// transform over every column.
  ///
  /// `_eob` and `_cpu` are unused here; they keep this fallback's signature
  /// identical to the SIMD implementations selected in the `cfg_if` above.
  #[cold_for_target_arch("x86_64", "aarch64")]
  pub fn inverse_transform_add<T: Pixel>(
    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16,
    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
  ) {
    let width: usize = tx_size.width();
    let height: usize = tx_size.height();

    // Only use at most 32 columns and 32 rows of input coefficients.
    let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];

    // For 64 point transforms, rely on the last 32 columns being initialized
    //   to zero for filling out missing input coeffs.
    let mut buffer = vec![0i32; width * height].into_boxed_slice();
    // Non-zero when the block is rectangular with a 2:1 (or 1:2) aspect
    // ratio, which requires an extra 1/sqrt(2) scale below.
    let rect_type = get_rect_tx_log_ratio(width, height);
    // (column txfm type, row txfm type) for the 2-D type.
    let tx_types_1d = get_1d_tx_types(tx_type);
    let lossless = tx_type == TxType::WHT_WHT;

    // perform inv txfm on every row
    let range = bd + 8;
    let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3];
    // 64 point transforms only signal 32 coeffs. We only take chunks of 32
    //   and skip over the last 32 transforms here.
    for (r, buffer_slice) in (0..height.min(32)).zip(buffer.chunks_mut(width))
    {
      // For 64 point transforms, rely on the last 32 elements being
      //   initialized to zero for filling out the missing coeffs.
      let mut temp_in: [i32; 64] = [0; 64];
      // Input coefficients are laid out transposed: row `r` of the block is
      // read with a stride of the (clamped) block height.
      for (raw, clamped) in input[r..]
        .iter()
        .map(|a| i32::cast_from(*a))
        .step_by(height.min(32))
        .zip(temp_in.iter_mut())
      {
        // Pre-scale: 1/sqrt(2) for 2:1 rect blocks; WHT (lossless) inputs
        // are shifted down by 2 instead.
        let val = if rect_type.abs() == 1 {
          round_shift(raw * INV_SQRT2, SQRT2_BITS)
        } else if lossless {
          raw >> 2
        } else {
          raw
        };
        *clamped = clamp_value(val, range);
      }
      txfm_fn(&temp_in, buffer_slice, range);
    }

    // perform inv txfm on every col
    let range = cmp::max(bd + 6, 16);
    let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3];
    for c in 0..width {
      let mut temp_in: [i32; 64] = [0; 64];
      let mut temp_out: [i32; 64] = [0; 64];
      // Gather column `c` from the row-pass output, applying the
      // size-dependent intermediate downshift between the two passes.
      for (raw, clamped) in
        buffer[c..].iter().step_by(width).zip(temp_in.iter_mut())
      {
        *clamped = clamp_value(
          round_shift(*raw, INV_INTERMEDIATE_SHIFTS[tx_size as usize]),
          range,
        );
      }
      txfm_fn(&temp_in, &mut temp_out, range);
      // Add the residual to the prediction already in `output`. The final
      // 4-bit rounding shift is skipped for lossless (WHT) blocks.
      for (temp, out) in temp_out
        .iter()
        .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
      {
        let v: i32 = (*out).as_();
        let r = if lossless { *temp } else { round_shift(*temp, 4) };
        let v = clamp(v + r, 0, (1 << bd) - 1);
        *out = T::cast_from(v);
      }
    }
  }

  /* From AV1 Spec.
  https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process
  */
  // Downshift applied between the row and column passes, indexed by TxSize.
  const INV_INTERMEDIATE_SHIFTS: [usize; TxSize::TX_SIZES_ALL] =
    [0, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2];
}