rav1e/transform/
forward_shared.rs

1// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10use super::TxSize;
11use super::TxType;
12
13use super::HTX_TAB;
14use super::VTX_TAB;
15
// A shift applied after each of the three internal stages of a forward
// transform pass.
// NOTE(review): negative entries presumably denote a left shift — confirm
// against the code that applies these shift tables.
pub type TxfmShift = [i8; 3];
// One `TxfmShift` per supported bit depth; the index is `(bd - 8) / 2`,
// i.e. 0 = 8-bit, 1 = 10-bit, 2 = 12-bit (see `Txfm2DFlipCfg::fwd`).
pub type TxfmShifts = [TxfmShift; 3];

// Shift so that the first shift is 4 - (bd - 8) to align with the initial
// design of daala_tx
// 8 bit 4x4 is an exception and only shifts by 3 in the first stage
const FWD_SHIFT_4X4: TxfmShifts = [[3, 0, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_32X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_64X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_4X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_32X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_32X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_64X32: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_4X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X32: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_32X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X64: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_64X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];

// The 4x4 Walsh-Hadamard transform uses a single shift table for all bit
// depths (selected in `Txfm2DFlipCfg::fwd` when `tx_type` is `WHT_WHT`).
const FWD_SHIFT_4X4_WHT: TxfmShift = [0, 0, 2];
43
/// Per-bit-depth shift tables for every transform size, indexed by
/// `TxSize as usize` (see `Txfm2DFlipCfg::fwd`); the entry order must
/// therefore match the declaration order of the `TxSize` enum.
pub const FWD_TXFM_SHIFT_LS: [TxfmShifts; TxSize::TX_SIZES_ALL] = [
  FWD_SHIFT_4X4,
  FWD_SHIFT_8X8,
  FWD_SHIFT_16X16,
  FWD_SHIFT_32X32,
  FWD_SHIFT_64X64,
  FWD_SHIFT_4X8,
  FWD_SHIFT_8X4,
  FWD_SHIFT_8X16,
  FWD_SHIFT_16X8,
  FWD_SHIFT_16X32,
  FWD_SHIFT_32X16,
  FWD_SHIFT_32X64,
  FWD_SHIFT_64X32,
  FWD_SHIFT_4X16,
  FWD_SHIFT_16X4,
  FWD_SHIFT_8X32,
  FWD_SHIFT_32X8,
  FWD_SHIFT_16X64,
  FWD_SHIFT_64X16,
];
65
/// A 1-D forward transform kernel, identified by family
/// (DCT / ADST / Identity / WHT) and point count. Resolved from a 2-D
/// `TxType` + `TxSize` pair via `TxfmType::AV1_TXFM_TYPE_LS`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxfmType {
  DCT4,
  DCT8,
  DCT16,
  DCT32,
  DCT64,
  ADST4,
  ADST8,
  ADST16,
  Identity4,
  Identity8,
  Identity16,
  Identity32,
  WHT4,
}
82
impl TxfmType {
  // Number of 1-D transform-type classes used as the inner index of
  // `AV1_TXFM_TYPE_LS`.
  const TX_TYPES_1D: usize = 5;
  // Maps [1-D size index (4/8/16/32/64)][1-D tx-type class] to the kernel
  // to run. `None` marks unsupported combinations; `Txfm2DFlipCfg::fwd`
  // unwraps the lookup, so hitting a `None` entry panics there.
  // NOTE(review): columns 1 and 2 both resolve to ADST — presumably the
  // plain and flipped ADST classes, with flipping handled separately via
  // `get_flip_cfg`; confirm against the `VTX_TAB`/`HTX_TAB` definitions.
  const AV1_TXFM_TYPE_LS: [[Option<TxfmType>; Self::TX_TYPES_1D]; 5] = [
    [
      Some(TxfmType::DCT4),
      Some(TxfmType::ADST4),
      Some(TxfmType::ADST4),
      Some(TxfmType::Identity4),
      Some(TxfmType::WHT4),
    ],
    [
      Some(TxfmType::DCT8),
      Some(TxfmType::ADST8),
      Some(TxfmType::ADST8),
      Some(TxfmType::Identity8),
      None,
    ],
    [
      Some(TxfmType::DCT16),
      Some(TxfmType::ADST16),
      Some(TxfmType::ADST16),
      Some(TxfmType::Identity16),
      None,
    ],
    [Some(TxfmType::DCT32), None, None, Some(TxfmType::Identity32), None],
    [Some(TxfmType::DCT64), None, None, None, None],
  ];
}
111
/// Fully resolved configuration for one 2-D forward transform: the 1-D
/// kernels to run on columns and rows, the per-stage shifts, and whether
/// the input block must be flipped first. Built by `Txfm2DFlipCfg::fwd`.
#[derive(Debug, Clone, Copy)]
pub struct Txfm2DFlipCfg {
  pub tx_size: TxSize,
  /// Flip upside down
  pub ud_flip: bool,
  /// Flip left to right
  pub lr_flip: bool,
  /// Shift applied after each stage, chosen by transform size and bit depth.
  pub shift: TxfmShift,
  /// 1-D kernel for the column (vertical) pass.
  pub txfm_type_col: TxfmType,
  /// 1-D kernel for the row (horizontal) pass.
  pub txfm_type_row: TxfmType,
}
123
124impl Txfm2DFlipCfg {
125  /// # Panics
126  ///
127  /// - If called with an invalid combination of `tx_size` and `tx_type`
128  pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self {
129    let tx_type_1d_col = VTX_TAB[tx_type as usize];
130    let tx_type_1d_row = HTX_TAB[tx_type as usize];
131    let txw_idx = tx_size.width_index();
132    let txh_idx = tx_size.height_index();
133    let txfm_type_col =
134      TxfmType::AV1_TXFM_TYPE_LS[txh_idx][tx_type_1d_col as usize].unwrap();
135    let txfm_type_row =
136      TxfmType::AV1_TXFM_TYPE_LS[txw_idx][tx_type_1d_row as usize].unwrap();
137    let (ud_flip, lr_flip) = Self::get_flip_cfg(tx_type);
138    let shift = if tx_type == TxType::WHT_WHT {
139      FWD_SHIFT_4X4_WHT
140    } else {
141      FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2]
142    };
143
144    Txfm2DFlipCfg {
145      tx_size,
146      ud_flip,
147      lr_flip,
148      shift,
149      txfm_type_col,
150      txfm_type_row,
151    }
152  }
153
154  /// Determine the flip config, returning `(ud_flip, lr_flip)`
155  const fn get_flip_cfg(tx_type: TxType) -> (bool, bool) {
156    use self::TxType::*;
157    match tx_type {
158      DCT_DCT | ADST_DCT | DCT_ADST | ADST_ADST | IDTX | V_DCT | H_DCT
159      | V_ADST | H_ADST | WHT_WHT => (false, false),
160      FLIPADST_DCT | FLIPADST_ADST | V_FLIPADST => (true, false),
161      DCT_FLIPADST | ADST_FLIPADST | H_FLIPADST => (false, true),
162      FLIPADST_FLIPADST => (true, true),
163    }
164  }
165}
166
// Store each expression into consecutive slots of `$arr`, starting at 0.
macro_rules! store_coeffs {
  ( $arr:expr, $( $x:expr ),* ) => {{
    // The counter starts at -1 and is bumped *before* each store so the
    // final repetition does not leave a dead trailing `+= 1` behind.
    let mut idx: i32 = -1;
    $(
      idx += 1;
      $arr[idx as usize] = $x;
    )*
  }};
}
178
179macro_rules! impl_1d_tx {
180() => {
181  impl_1d_tx! {allow(unused_attributes), }
182};
183
184($m:meta, $($s:ident),*) => {
185  pub trait TxOperations: Copy {
186    $($s)* fn zero() -> Self;
187
188    $($s)* fn tx_mul<const SHIFT: i32>(self, mul: i32) -> Self;
189    $($s)* fn rshift1(self) -> Self;
190    $($s)* fn add(self, b: Self) -> Self;
191    $($s)* fn sub(self, b: Self) -> Self;
192    $($s)* fn add_avg(self, b: Self) -> Self;
193    $($s)* fn sub_avg(self, b: Self) -> Self;
194
195    $($s)* fn copy_fn(self) -> Self {
196      self
197    }
198  }
199
200  #[inline]
201  fn get_func(t: TxfmType) -> TxfmFunc {
202    use self::TxfmType::*;
203    match t {
204      DCT4 => daala_fdct4,
205      DCT8 => daala_fdct8,
206      DCT16 => daala_fdct16,
207      DCT32 => daala_fdct32,
208      DCT64 => daala_fdct64,
209      ADST4 => daala_fdst_vii_4,
210      ADST8 => daala_fdst8,
211      ADST16 => daala_fdst16,
212      Identity4 => fidentity,
213      Identity8 => fidentity,
214      Identity16 => fidentity,
215      Identity32 => fidentity,
216      WHT4 => fwht4,
217    }
218  }
219
220  trait RotateKernelPi4<T: TxOperations> {
221  const ADD: $($s)* fn(T, T) -> T;
222  const SUB: $($s)* fn(T, T) -> T;
223
224  #[$m]
225  $($s)* fn kernel<const SHIFT0: i32, const SHIFT1: i32>(p0: T, p1: T, m: (i32, i32)) -> (T, T) {
226    let t = Self::ADD(p1, p0);
227    let (a, out0) = (p0.tx_mul::<SHIFT0>(m.0), t.tx_mul::<SHIFT1>(m.1));
228    let out1 = Self::SUB(a, out0);
229    (out0, out1)
230  }
231}
232
233struct RotatePi4Add;
234struct RotatePi4AddAvg;
235struct RotatePi4Sub;
236struct RotatePi4SubAvg;
237
238impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4Add {
239  const ADD: $($s)* fn(T, T) -> T = T::add;
240  const SUB: $($s)* fn(T, T) -> T = T::sub;
241}
242
243impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4AddAvg {
244  const ADD: $($s)* fn(T, T) -> T = T::add_avg;
245  const SUB: $($s)* fn(T, T) -> T = T::sub;
246}
247
248impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4Sub {
249  const ADD: $($s)* fn(T, T) -> T = T::sub;
250  const SUB: $($s)* fn(T, T) -> T = T::add;
251}
252
253impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4SubAvg {
254  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
255  const SUB: $($s)* fn(T, T) -> T = T::add;
256}
257
258trait RotateKernel<T: TxOperations> {
259  const ADD: $($s)* fn(T, T) -> T;
260  const SUB: $($s)* fn(T, T) -> T;
261  const SHIFT: $($s)* fn(T) -> T;
262
263  #[$m]
264  $($s)* fn half_kernel<const SHIFT0: i32, const SHIFT1: i32, const SHIFT2: i32>(
265    p0: (T, T), p1: T, m: (i32, i32, i32),
266  ) -> (T, T) {
267    let t = Self::ADD(p1, p0.0);
268    let (a, b, c) = (p0.1.tx_mul::<SHIFT0>(m.0), p1.tx_mul::<SHIFT1>(m.1), t.tx_mul::<SHIFT2>(m.2));
269    let out0 = b.add(c);
270    let shifted = Self::SHIFT(c);
271    let out1 = Self::SUB(a, shifted);
272    (out0, out1)
273  }
274
275  #[$m]
276  $($s)* fn kernel<const SHIFT0: i32, const SHIFT1: i32, const SHIFT2: i32>(p0: T, p1: T, m: (i32, i32, i32)) -> (T, T) {
277    Self::half_kernel::<SHIFT0, SHIFT1, SHIFT2>((p0, p0), p1, m)
278  }
279}
280
281trait RotateKernelNeg<T: TxOperations> {
282  const ADD: $($s)* fn(T, T) -> T;
283
284  #[$m]
285  $($s)* fn kernel<const SHIFT0: i32, const SHIFT1: i32, const SHIFT2: i32>(p0: T, p1: T, m: (i32, i32, i32)) -> (T, T) {
286    let t = Self::ADD(p0, p1);
287    let (a, b, c) = (p0.tx_mul::<SHIFT0>(m.0), p1.tx_mul::<SHIFT1>(m.1), t.tx_mul::<SHIFT2>(m.2));
288    let out0 = b.sub(c);
289    let out1 = c.sub(a);
290    (out0, out1)
291  }
292}
293
294struct RotateAdd;
295struct RotateAddAvg;
296struct RotateAddShift;
297struct RotateSub;
298struct RotateSubAvg;
299struct RotateSubShift;
300struct RotateNeg;
301struct RotateNegAvg;
302
303impl<T: TxOperations> RotateKernel<T> for RotateAdd {
304  const ADD: $($s)* fn(T, T) -> T = T::add;
305  const SUB: $($s)* fn(T, T) -> T = T::sub;
306  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
307}
308
309impl<T: TxOperations> RotateKernel<T> for RotateAddAvg {
310  const ADD: $($s)* fn(T, T) -> T = T::add_avg;
311  const SUB: $($s)* fn(T, T) -> T = T::sub;
312  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
313}
314
315impl<T: TxOperations> RotateKernel<T> for RotateAddShift {
316  const ADD: $($s)* fn(T, T) -> T = T::add;
317  const SUB: $($s)* fn(T, T) -> T = T::sub;
318  const SHIFT: $($s)* fn(T) -> T = T::rshift1;
319}
320
321impl<T: TxOperations> RotateKernel<T> for RotateSub {
322  const ADD: $($s)* fn(T, T) -> T = T::sub;
323  const SUB: $($s)* fn(T, T) -> T = T::add;
324  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
325}
326
327impl<T: TxOperations> RotateKernel<T> for RotateSubAvg {
328  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
329  const SUB: $($s)* fn(T, T) -> T = T::add;
330  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
331}
332
333impl<T: TxOperations> RotateKernel<T> for RotateSubShift {
334  const ADD: $($s)* fn(T, T) -> T = T::sub;
335  const SUB: $($s)* fn(T, T) -> T = T::add;
336  const SHIFT: $($s)* fn(T) -> T = T::rshift1;
337}
338
339impl<T: TxOperations> RotateKernelNeg<T> for RotateNeg {
340  const ADD: $($s)* fn(T, T) -> T = T::sub;
341}
342
343impl<T: TxOperations> RotateKernelNeg<T> for RotateNegAvg {
344  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
345}
346
347#[inline]
348#[$m]
349$($s)* fn butterfly_add<T: TxOperations>(p0: T, p1: T) -> ((T, T), T) {
350  let p0 = p0.add(p1);
351  let p0h = p0.rshift1();
352  let p1h = p1.sub(p0h);
353  ((p0h, p0), p1h)
354}
355
356#[inline]
357#[$m]
358$($s)* fn butterfly_sub<T: TxOperations>(p0: T, p1: T) -> ((T, T), T) {
359  let p0 = p0.sub(p1);
360  let p0h = p0.rshift1();
361  let p1h = p1.add(p0h);
362  ((p0h, p0), p1h)
363}
364
365#[inline]
366#[$m]
367$($s)* fn butterfly_neg<T: TxOperations>(p0: T, p1: T) -> (T, (T, T)) {
368  let p1 = p0.sub(p1);
369  let p1h = p1.rshift1();
370  let p0h = p0.sub(p1h);
371  (p0h, (p1h, p1))
372}
373
374#[inline]
375#[$m]
376$($s)* fn butterfly_add_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
377  let p1 = p1h.add(p0.0);
378  let p0 = p0.1.sub(p1);
379  (p0, p1)
380}
381
382#[inline]
383#[$m]
384$($s)* fn butterfly_sub_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
385  let p1 = p1h.sub(p0.0);
386  let p0 = p0.1.add(p1);
387  (p0, p1)
388}
389
390#[inline]
391#[$m]
392$($s)* fn butterfly_neg_asym<T: TxOperations>(p0h: T, p1: (T, T)) -> (T, T) {
393  let p0 = p0h.add(p1.0);
394  let p1 = p0.sub(p1.1);
395  (p0, p1)
396}
397
398#[$m]
399$($s)* fn daala_fdct_ii_2_asym<T: TxOperations>(p0h: T, p1: (T, T)) -> (T, T) {
400  butterfly_neg_asym(p0h, p1)
401}
402
403#[$m]
404$($s)* fn daala_fdst_iv_2_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
405  //   473/512 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867
406  // 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795
407  // 4433/8192 = Cos[3*Pi/8]*Sqrt[2]                 = 0.5411961001461971
408  RotateAdd::half_kernel::<9, 12, 13>(p0, p1h, (473, 3135, 4433))
409}
410
411#[$m]
412$($s)* fn daala_fdct_ii_4<T: TxOperations>(
413  q0: T, q1: T, q2: T, q3: T, output: &mut [T],
414) {
415  // +/- Butterflies with asymmetric output.
416  let (q0h, q3) = butterfly_neg(q0, q3);
417  let (q1, q2h) = butterfly_add(q1, q2);
418
419  // Embedded 2-point transforms with asymmetric input.
420  let (q0, q1) = daala_fdct_ii_2_asym(q0h, q1);
421  let (q3, q2) = daala_fdst_iv_2_asym(q3, q2h);
422
423  store_coeffs!(output, q0, q1, q2, q3);
424}
425
426#[$m]
427$($s)* fn daala_fdct4<T: TxOperations>(coeffs: &mut [T]) {
428  assert!(coeffs.len() >= 4);
429  let mut temp_out: [T; 4] = [T::zero(); 4];
430  daala_fdct_ii_4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], &mut temp_out);
431
432  coeffs[0] = temp_out[0];
433  coeffs[1] = temp_out[2];
434  coeffs[2] = temp_out[1];
435  coeffs[3] = temp_out[3];
436}
437
438#[$m]
439$($s)* fn daala_fdst_vii_4<T: TxOperations>(coeffs: &mut [T]) {
440  assert!(coeffs.len() >= 4);
441
442  let q0 = coeffs[0];
443  let q1 = coeffs[1];
444  let q2 = coeffs[2];
445  let q3 = coeffs[3];
446  let t0 = q1.add(q3);
447  // t1 = (q0 + q1 - q3)/2
448  let t1 = q1.add(q0.sub_avg(t0));
449  let t2 = q0.sub(q1);
450  let t3 = q2;
451  let t4 = q0.add(q3);
452  // 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360
453  let t0 = t0.tx_mul::<14>(7021);
454  // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252
455  let t1 = t1.tx_mul::<15>(37837);
456  // 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139
457  let t2 = t2.tx_mul::<15>(21513);
458  // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252
459  let t3 = t3.tx_mul::<15>(37837);
460  // 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779
461  let t4 = t4.tx_mul::<11>(467);
462  let t3h = t3.rshift1();
463  let u4 = t4.add(t3h);
464  coeffs[0] = t0.add(u4);
465  coeffs[1] = t1;
466  coeffs[2] = t0.add(t2.sub(t3h));
467  coeffs[3] = t2.add(t3.sub(u4));
468}
469
470#[$m]
471$($s)* fn daala_fdct_ii_2<T: TxOperations>(p0: T, p1: T) -> (T, T) {
472  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4]  = 1.4142135623730951
473  // 11585/8192 = 2*Cos[Pi/4]            = 1.4142135623730951
474  let (p1, p0) = RotatePi4SubAvg::kernel::<13, 13>(p1, p0, (11585, 11585));
475  (p0, p1)
476}
477
478#[$m]
479$($s)* fn daala_fdst_iv_2<T: TxOperations>(p0: T, p1: T) -> (T, T) {
480  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8]  = 1.3065629648763766
481  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8]  = 0.5411961001461971
482  //  3135/4096 = 2*Cos[3*Pi/8]              = 0.7653668647301796
483  RotateAddAvg::kernel::<13, 14, 12>(p0, p1, (10703, 8867, 3135))
484}
485
486#[$m]
487$($s)* fn daala_fdct_ii_4_asym<T: TxOperations>(
488  q0h: T, q1: (T, T), q2h: T, q3: (T, T), output: &mut [T],
489) {
490  // +/- Butterflies with asymmetric input.
491  let (q0, q3) = butterfly_neg_asym(q0h, q3);
492  let (q1, q2) = butterfly_sub_asym(q1, q2h);
493
494  // Embedded 2-point orthonormal transforms.
495  let (q0, q1) = daala_fdct_ii_2(q0, q1);
496  let (q3, q2) = daala_fdst_iv_2(q3, q2);
497
498  store_coeffs!(output, q0, q1, q2, q3);
499}
500
501#[$m]
502$($s)* fn daala_fdst_iv_4_asym<T: TxOperations>(
503  q0: (T, T), q1h: T, q2: (T, T), q3h: T, output: &mut [T],
504) {
505  // Stage 0
506  //  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793
507  //  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045
508  // 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565
509  let (q0, q3) = RotateAddShift::half_kernel::<14, 13, 15>(
510    q0,
511    q3h,
512    (9633, 12873, 12785),
513  );
514  // 11363/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738
515  // 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861
516  //  4551/4096 = Cos[5*Pi/16]*2                  = 1.1111404660392044
517  let (q2, q1) = RotateSubShift::half_kernel::<14, 15, 12>(
518    q2,
519    q1h,
520    (11363, 18081, 4551),
521  );
522
523  // Stage 1
524  let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3);
525  let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1);
526
527  // Stage 2
528  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
529  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
530  let (q2, q1) = RotatePi4AddAvg::kernel::<13, 13>(q2, q1, (11585, 11585));
531
532  store_coeffs!(output, q0, q1, q2, q3);
533}
534
535#[$m]
536$($s)* fn daala_fdct_ii_8<T: TxOperations>(
537  r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T],
538) {
539  // +/- Butterflies with asymmetric output.
540  let (r0h, r7) = butterfly_neg(r0, r7);
541  let (r1, r6h) = butterfly_add(r1, r6);
542  let (r2h, r5) = butterfly_neg(r2, r5);
543  let (r3, r4h) = butterfly_add(r3, r4);
544
545  // Embedded 4-point transforms with asymmetric input.
546  daala_fdct_ii_4_asym(r0h, r1, r2h, r3, &mut output[0..4]);
547  daala_fdst_iv_4_asym(r7, r6h, r5, r4h, &mut output[4..8]);
548  output[4..8].reverse();
549}
550
551#[$m]
552$($s)* fn daala_fdct8<T: TxOperations>(coeffs: &mut [T]) {
553  assert!(coeffs.len() >= 8);
554  let mut temp_out: [T; 8] = [T::zero(); 8];
555  daala_fdct_ii_8(
556    coeffs[0],
557    coeffs[1],
558    coeffs[2],
559    coeffs[3],
560    coeffs[4],
561    coeffs[5],
562    coeffs[6],
563    coeffs[7],
564    &mut temp_out,
565  );
566
567  coeffs[0] = temp_out[0];
568  coeffs[1] = temp_out[4];
569  coeffs[2] = temp_out[2];
570  coeffs[3] = temp_out[6];
571  coeffs[4] = temp_out[1];
572  coeffs[5] = temp_out[5];
573  coeffs[6] = temp_out[3];
574  coeffs[7] = temp_out[7];
575}
576
577#[$m]
578$($s)* fn daala_fdst_iv_8<T: TxOperations>(
579  r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T],
580) {
581  // Stage 0
582  // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
583  // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
584  //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
585  let (r0, r7) =
586    RotateAdd::kernel::<14, 14, 13>(r0, r7, (17911, 14699, 803));
587  // 20435/16384 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123
588  // 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650
589  //   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233
590  let (r6, r1) =
591    RotateSub::kernel::<14, 15, 12>(r6, r1, (20435, 21845, 1189));
592  // 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
593  //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
594  // 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764
595  let (r2, r5) =
596    RotateAdd::kernel::<14, 13, 15>(r2, r5, (22173, 3363, 15447));
597  // 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
598  //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
599  //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
600  let (r4, r3) =
601    RotateSub::kernel::<14, 14, 13>(r4, r3, (23059, 2271, 5197));
602
603  // Stage 1
604  let (r0, r3h) = butterfly_add(r0, r3);
605  let (r2, r1h) = butterfly_sub(r2, r1);
606  let (r5, r6h) = butterfly_add(r5, r6);
607  let (r7, r4h) = butterfly_sub(r7, r4);
608
609  // Stage 2
610  let (r7, r6) = butterfly_add_asym(r7, r6h);
611  let (r5, r3) = butterfly_add_asym(r5, r3h);
612  let (r2, r4) = butterfly_add_asym(r2, r4h);
613  let (r0, r1) = butterfly_sub_asym(r0, r1h);
614
615  // Stage 3
616  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
617  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
618  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
619  let (r3, r4) =
620    RotateSubAvg::kernel::<13, 14, 12>(r3, r4, (10703, 8867, 3135));
621  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
622  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
623  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
624  let (r2, r5) =
625    RotateNegAvg::kernel::<13, 14, 12>(r2, r5, (10703, 8867, 3135));
626  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
627  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
628  let (r1, r6) = RotatePi4SubAvg::kernel::<13, 13>(r1, r6, (11585, 11585));
629
630  store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7);
631}
632
633#[$m]
634$($s)* fn daala_fdst8<T: TxOperations>(coeffs: &mut [T]) {
635  assert!(coeffs.len() >= 8);
636  let mut temp_out: [T; 8] = [T::zero(); 8];
637  daala_fdst_iv_8(
638    coeffs[0],
639    coeffs[1],
640    coeffs[2],
641    coeffs[3],
642    coeffs[4],
643    coeffs[5],
644    coeffs[6],
645    coeffs[7],
646    &mut temp_out,
647  );
648
649  coeffs[0] = temp_out[0];
650  coeffs[1] = temp_out[4];
651  coeffs[2] = temp_out[2];
652  coeffs[3] = temp_out[6];
653  coeffs[4] = temp_out[1];
654  coeffs[5] = temp_out[5];
655  coeffs[6] = temp_out[3];
656  coeffs[7] = temp_out[7];
657}
658
659#[$m]
660$($s)* fn daala_fdst_iv_4<T: TxOperations>(
661  q0: T, q1: T, q2: T, q3: T, output: &mut [T],
662) {
663  // Stage 0
664  // 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545
665  //   4551/4096 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204
666  //  9041/32768 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943
667  let (q0, q3) =
668    RotateAddShift::kernel::<14, 12, 11>(q0, q3, (13623, 4551, 565));
669  // 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304
670  // 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566
671  //   1609/2048 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021
672  let (q2, q1) =
673    RotateSubShift::kernel::<14, 15, 11>(q2, q1, (16069, 12785, 1609));
674
675  // Stage 1
676  let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3);
677  let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1);
678
679  // Stage 2
680  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
681  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
682  let (q2, q1) = RotatePi4AddAvg::kernel::<13, 13>(q2, q1, (11585, 11585));
683
684  store_coeffs!(output, q0, q1, q2, q3);
685}
686
687
688#[$m]
689$($s)* fn daala_fdct_ii_8_asym<T: TxOperations>(
690  r0h: T, r1: (T, T), r2h: T, r3: (T, T), r4h: T, r5: (T, T), r6h: T,
691  r7: (T, T), output: &mut [T],
692) {
693  // +/- Butterflies with asymmetric input.
694  let (r0, r7) = butterfly_neg_asym(r0h, r7);
695  let (r1, r6) = butterfly_sub_asym(r1, r6h);
696  let (r2, r5) = butterfly_neg_asym(r2h, r5);
697  let (r3, r4) = butterfly_sub_asym(r3, r4h);
698
699  // Embedded 4-point orthonormal transforms.
700  daala_fdct_ii_4(r0, r1, r2, r3, &mut output[0..4]);
701  daala_fdst_iv_4(r7, r6, r5, r4, &mut output[4..8]);
702  output[4..8].reverse();
703}
704
705#[$m]
706$($s)* fn daala_fdst_iv_8_asym<T: TxOperations>(
707  r0: (T, T), r1h: T, r2: (T, T), r3h: T, r4: (T, T), r5h: T, r6: (T, T),
708  r7h: T, output: &mut [T],
709) {
710  // Stage 0
711  // 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274
712  //   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729
713  //  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909
714  let (r0, r7) =
715    RotateAdd::half_kernel::<14, 12, 14>(r0, r7h, (12665, 5197, 2271));
716  // 14449/16384 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355
717  // 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995
718  //   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                  = 0.410524527522357
719  let (r6, r1) =
720    RotateSub::half_kernel::<14, 15, 13>(r6, r1h, (14449, 30893, 3363));
721  // 15679/16384 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209
722  //   1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925
723  //   5461/8192 = Cos[11*Pi/32]*Sqrt[2]                  = 0.666655658477747
724  let (r2, r5) =
725    RotateAdd::half_kernel::<14, 11, 13>(r2, r5h, (15679, 1189, 5461));
726  // 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969
727  //    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213
728  // 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364
729  let (r4, r3) =
730    RotateSub::half_kernel::<14, 12, 14>(r4, r3h, (16305, 803, 14699));
731
732  // Stage 1
733  let (r0, r3h) = butterfly_add(r0, r3);
734  let (r2, r1h) = butterfly_sub(r2, r1);
735  let (r5, r6h) = butterfly_add(r5, r6);
736  let (r7, r4h) = butterfly_sub(r7, r4);
737
738  // Stage 2
739  let (r7, r6) = butterfly_add_asym(r7, r6h);
740  let (r5, r3) = butterfly_add_asym(r5, r3h);
741  let (r2, r4) = butterfly_add_asym(r2, r4h);
742  let (r0, r1) = butterfly_sub_asym(r0, r1h);
743
744  // Stage 3
745  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
746  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
747  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
748  let (r3, r4) =
749    RotateSubAvg::kernel::<9, 14, 12>(r3, r4, (669, 8867, 3135));
750  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
751  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
752  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
753  let (r2, r5) =
754    RotateNegAvg::kernel::<9, 14, 12>(r2, r5, (669, 8867, 3135));
755  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
756  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
757  let (r1, r6) = RotatePi4SubAvg::kernel::<12, 13>(r1, r6, (5793, 11585));
758
759  store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7);
760}
761
762#[$m]
763$($s)* fn daala_fdct_ii_16<T: TxOperations>(
764  s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T,
765  sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T],
766) {
767  // +/- Butterflies with asymmetric output.
768  let (s0h, sf) = butterfly_neg(s0, sf);
769  let (s1, seh) = butterfly_add(s1, se);
770  let (s2h, sd) = butterfly_neg(s2, sd);
771  let (s3, sch) = butterfly_add(s3, sc);
772  let (s4h, sb) = butterfly_neg(s4, sb);
773  let (s5, sah) = butterfly_add(s5, sa);
774  let (s6h, s9) = butterfly_neg(s6, s9);
775  let (s7, s8h) = butterfly_add(s7, s8);
776
777  // Embedded 8-point transforms with asymmetric input.
778  daala_fdct_ii_8_asym(s0h, s1, s2h, s3, s4h, s5, s6h, s7, &mut output[0..8]);
779  daala_fdst_iv_8_asym(sf, seh, sd, sch, sb, sah, s9, s8h, &mut output[8..16]);
780  output[8..16].reverse();
781}
782
783#[$m]
784$($s)* fn daala_fdct16<T: TxOperations>(coeffs: &mut [T]) {
785  assert!(coeffs.len() >= 16);
786  let mut temp_out: [T; 16] = [T::zero(); 16];
787  daala_fdct_ii_16(
788    coeffs[0],
789    coeffs[1],
790    coeffs[2],
791    coeffs[3],
792    coeffs[4],
793    coeffs[5],
794    coeffs[6],
795    coeffs[7],
796    coeffs[8],
797    coeffs[9],
798    coeffs[10],
799    coeffs[11],
800    coeffs[12],
801    coeffs[13],
802    coeffs[14],
803    coeffs[15],
804    &mut temp_out,
805  );
806
807  coeffs[0] = temp_out[0];
808  coeffs[1] = temp_out[8];
809  coeffs[2] = temp_out[4];
810  coeffs[3] = temp_out[12];
811  coeffs[4] = temp_out[2];
812  coeffs[5] = temp_out[10];
813  coeffs[6] = temp_out[6];
814  coeffs[7] = temp_out[14];
815  coeffs[8] = temp_out[1];
816  coeffs[9] = temp_out[9];
817  coeffs[10] = temp_out[5];
818  coeffs[11] = temp_out[13];
819  coeffs[12] = temp_out[3];
820  coeffs[13] = temp_out[11];
821  coeffs[14] = temp_out[7];
822  coeffs[15] = temp_out[15];
823}
824
825#[$m]
826$($s)* fn daala_fdst_iv_16<T: TxOperations>(
827  s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T,
828  sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T],
829) {
830  // Stage 0
831  // 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496
832  //  11003/8192 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404
833  //  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794
834  let (s0, sf) =
835    RotateAddShift::kernel::<15, 13, 14>(s0, sf, (24279, 11003, 1137));
836  // 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449
837  //   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667
838  //  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116
839  let (se, s1) =
840    RotateSubShift::kernel::<11, 8, 11>(se, s1, (1645, 305, 425));
841  // 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027
842  //   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644
843  //   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705
844  let (s2, sd) =
845    RotateAddShift::kernel::<14, 13, 13>(s2, sd, (14053, 8423, 2815));
846  // 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344
847  //   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056
848  //   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316
849  let (sc, s3) =
850    RotateSubShift::kernel::<14, 13, 13>(sc, s3, (14811, 7005, 3903));
851  // 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302
852  // 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444
853  //  9907/16384 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080
854  let (s4, sb) =
855    RotateAddShift::kernel::<15, 14, 14>(s4, sb, (30853, 11039, 9907));
856  // 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454
857  //   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.89716758634264
858  //   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128
859  let (sa, s5) =
860    RotateSubShift::kernel::<14, 13, 11>(sa, s5, (15893, 3981, 1489));
861  // 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478
862  //    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072
863  // 13803/16384 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942
864  let (s6, s9) =
865    RotateAddShift::kernel::<15, 11, 14>(s6, s9, (32413, 601, 13803));
866  // 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517
867  //    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484
868  //   1945/2048 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775
869  let (s8, s7) =
870    RotateSubShift::kernel::<15, 11, 11>(s8, s7, (32729, 201, 1945));
871
872  // Stage 1
873  let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7);
874  let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf);
875  let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3);
876  let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb);
877  let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5);
878  let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd);
879  let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1);
880  let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9);
881
882  // Stage 2
883  let ((_s8h, s8), s4h) = butterfly_add(s8, s4);
884  let ((_s7h, s7), sbh) = butterfly_add(s7, sb);
885  let ((_sah, sa), s6h) = butterfly_sub(sa, s6);
886  let ((_s5h, s5), s9h) = butterfly_sub(s5, s9);
887  let (s0, s3h) = butterfly_add(s0, s3);
888  let (sd, seh) = butterfly_add(sd, se);
889  let (s2, s1h) = butterfly_sub(s2, s1);
890  let (sf, sch) = butterfly_sub(sf, sc);
891
892  // Stage 3
893  //     301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
894  //   1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
895  // 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565
896  let (s8, s7) =
897    RotateAddAvg::kernel::<8, 11, 15>(s8, s7, (301, 1609, 12785));
898  // 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
899  // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
900  //  4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
901  let (s9, s6) =
902    RotateAdd::kernel::<13, 15, 13>(s9h, s6h, (11363, 9041, 4551));
903  //  5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
904  // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
905  //  4551/4096 = 2*Cos[5*Pi/16]              = 1.1111404660392044
906  let (s5, sa) =
907    RotateNegAvg::kernel::<12, 15, 12>(s5, sa, (5681, 9041, 4551));
908  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
909  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
910  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
911  let (s4, sb) =
912    RotateNeg::kernel::<13, 14, 15>(s4h, sbh, (9633, 12873, 6393));
913
914  // Stage 4
915  let (s2, sc) = butterfly_add_asym(s2, sch);
916  let (s0, s1) = butterfly_sub_asym(s0, s1h);
917  let (sf, se) = butterfly_add_asym(sf, seh);
918  let (sd, s3) = butterfly_add_asym(sd, s3h);
919  let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6);
920  let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9);
921  let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb);
922  let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4);
923
924  // Stage 5
925  //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
926  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
927  //  3135/4096 = 2*Cos[7*Pi/8]             = 0.7653668647301796
928  let (sc, s3) =
929    RotateAddAvg::kernel::<9, 14, 12>(sc, s3, (669, 8867, 3135));
930  //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475
931  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
932  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
933  let (s2, sd) =
934    RotateNegAvg::kernel::<9, 14, 12>(s2, sd, (669, 8867, 3135));
935  //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
936  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
937  let (sa, s5) = RotatePi4AddAvg::kernel::<12, 13>(sa, s5, (5793, 11585));
938  //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
939  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
940  let (s6, s9) = RotatePi4AddAvg::kernel::<12, 13>(s6, s9, (5793, 11585));
941  //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
942  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
943  let (se, s1) = RotatePi4AddAvg::kernel::<12, 13>(se, s1, (5793, 11585));
944
945  store_coeffs!(
946    output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf
947  );
948}
949
950#[$m]
951$($s)* fn daala_fdst16<T: TxOperations>(coeffs: &mut [T]) {
952  assert!(coeffs.len() >= 16);
953  let mut temp_out: [T; 16] = [T::zero(); 16];
954  daala_fdst_iv_16(
955    coeffs[0],
956    coeffs[1],
957    coeffs[2],
958    coeffs[3],
959    coeffs[4],
960    coeffs[5],
961    coeffs[6],
962    coeffs[7],
963    coeffs[8],
964    coeffs[9],
965    coeffs[10],
966    coeffs[11],
967    coeffs[12],
968    coeffs[13],
969    coeffs[14],
970    coeffs[15],
971    &mut temp_out,
972  );
973
974  coeffs[0] = temp_out[0];
975  coeffs[1] = temp_out[8];
976  coeffs[2] = temp_out[4];
977  coeffs[3] = temp_out[12];
978  coeffs[4] = temp_out[2];
979  coeffs[5] = temp_out[10];
980  coeffs[6] = temp_out[6];
981  coeffs[7] = temp_out[14];
982  coeffs[8] = temp_out[1];
983  coeffs[9] = temp_out[9];
984  coeffs[10] = temp_out[5];
985  coeffs[11] = temp_out[13];
986  coeffs[12] = temp_out[3];
987  coeffs[13] = temp_out[11];
988  coeffs[14] = temp_out[7];
989  coeffs[15] = temp_out[15];
990}
991
// Forward 16-point Type-II DCT with asymmetric input.
//
// Inputs alternate between single half-scaled `*h` values and `(T, T)`
// pairs, as produced by the caller's asymmetric butterfly stage (see
// daala_fdct_ii_32). NOTE(review): the exact scaling convention of the
// `*h` values and pairs is defined by the butterfly_* helpers, which are
// not visible in this section — confirm there.
#[$m]
$($s)* fn daala_fdct_ii_16_asym<T: TxOperations>(
  s0h: T, s1: (T, T), s2h: T, s3: (T, T), s4h: T, s5: (T, T), s6h: T,
  s7: (T, T), s8h: T, s9: (T, T), sah: T, sb: (T, T), sch: T, sd: (T, T),
  seh: T, sf: (T, T), output: &mut [T],
) {
  // +/- Butterflies with asymmetric input.
  let (s0, sf) = butterfly_neg_asym(s0h, sf);
  let (s1, se) = butterfly_sub_asym(s1, seh);
  let (s2, sd) = butterfly_neg_asym(s2h, sd);
  let (s3, sc) = butterfly_sub_asym(s3, sch);
  let (s4, sb) = butterfly_neg_asym(s4h, sb);
  let (s5, sa) = butterfly_sub_asym(s5, sah);
  let (s6, s9) = butterfly_neg_asym(s6h, s9);
  let (s7, s8) = butterfly_sub_asym(s7, s8h);

  // Embedded 8-point orthonormal transforms.
  // The DCT-II half lands in output[0..8]; the DST-IV half is computed
  // into output[8..16] and then reversed in place.
  daala_fdct_ii_8(s0, s1, s2, s3, s4, s5, s6, s7, &mut output[0..8]);
  daala_fdst_iv_8(sf, se, sd, sc, sb, sa, s9, s8, &mut output[8..16]);
  output[8..16].reverse();
}
1013
// Forward 16-point Type-IV DST with asymmetric input.
//
// Inputs alternate between `(T, T)` pairs and single half-scaled `*h`
// values produced by the caller's asymmetric butterfly stage (see
// daala_fdct_ii_32). NOTE(review): the exact scaling convention of the
// pairs/halves is defined by the butterfly_*/kernel helpers, which are not
// visible in this section — confirm there.
//
// Each rotation's comment lists its three fixed-point constants as
// numerator/2^shift fractions; the const generics on the kernel calls are
// the matching shift amounts (e.g. 8192 = 2^13 pairs with shift 13).
#[$m]
$($s)* fn daala_fdst_iv_16_asym<T: TxOperations>(
  s0: (T, T), s1h: T, s2: (T, T), s3h: T, s4: (T, T), s5h: T, s6: (T, T),
  s7h: T, s8: (T, T), s9h: T, sa: (T, T), sbh: T, sc: (T, T), sdh: T,
  se: (T, T), sfh: T, output: &mut [T],
) {
  // Stage 0
  //   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953
  // 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088
  //    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360
  let (s0, sf) =
    RotateAddShift::half_kernel::<11, 15, 11>(s0, sfh, (1073, 62241, 201));
  // 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714
  // 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384
  //    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235
  let (se, s1) = RotateSubShift::half_kernel::<15, 15, 11>(
    se,
    s1h,
    (18611, 55211, 601),
  );
  //  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039
  //   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602
  //   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277
  let (s2, sd) =
    RotateAddShift::half_kernel::<14, 10, 13>(s2, sdh, (9937, 1489, 3981));
  // 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205
  // 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014
  // 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401
  let (sc, s3) = RotateSubShift::half_kernel::<14, 15, 14>(
    sc,
    s3h,
    (10473, 39627, 11039),
  );
  // 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628
  // 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225
  // 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642
  let (s4, sb) =
    RotateAddShift::half_kernel::<12, 12, 13>(s4, sbh, (2727, 3903, 7005));
  // 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569
  // 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069
  // 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433
  let (sa, s5) =
    RotateSubShift::half_kernel::<13, 12, 13>(sa, s5h, (5619, 2815, 8423));
  //   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391
  // 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232
  //     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667
  let (s6, s9) =
    RotateAddShift::half_kernel::<12, 15, 8>(s6, s9h, (2865, 13599, 305));
  // 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887
  //   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816
  //  11003/8192 = Cos[17*Pi/64]*2                   = 1.3431179096940367
  let (s8, s7) = RotateSubShift::half_kernel::<15, 13, 13>(
    s8,
    s7h,
    (23143, 1137, 11003),
  );

  // Stage 1
  let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7);
  let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf);
  let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3);
  let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb);
  let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5);
  let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd);
  let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1);
  let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9);

  // Stage 2
  // The leading `_`-prefixed halves are unused by this variant.
  let ((_s8h, s8), s4h) = butterfly_add(s8, s4);
  let ((_s7h, s7), sbh) = butterfly_add(s7, sb);
  let ((_sah, sa), s6h) = butterfly_sub(sa, s6);
  let ((_s5h, s5), s9h) = butterfly_sub(s5, s9);
  let (s0, s3h) = butterfly_add(s0, s3);
  let (sd, seh) = butterfly_add(sd, se);
  let (s2, s1h) = butterfly_sub(s2, s1);
  let (sf, sch) = butterfly_sub(sf, sc);

  // Stage 3
  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (s8, s7) =
    RotateAdd::kernel::<13, 14, 15>(s8, s7, (9633, 12873, 6393));
  // 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (s9, s6) =
    RotateAdd::kernel::<14, 15, 13>(s9h, s6h, (22725, 9041, 4551));
  //  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (s5, sa) =
    RotateNeg::kernel::<13, 15, 13>(s5, sa, (11363, 9041, 4551));
  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (s4, sb) =
    RotateNeg::kernel::<13, 14, 15>(s4h, sbh, (9633, 12873, 6393));

  // Stage 4
  let (s2, sc) = butterfly_add_asym(s2, sch);
  let (s0, s1) = butterfly_sub_asym(s0, s1h);
  let (sf, se) = butterfly_add_asym(sf, seh);
  let (sd, s3) = butterfly_add_asym(sd, s3h);
  let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6);
  let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9);
  let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb);
  let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4);

  // Stage 5
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (sc, s3) =
    RotateAdd::kernel::<13, 14, 13>(sc, s3, (10703, 8867, 3135));
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (s2, sd) =
    RotateNeg::kernel::<13, 14, 13>(s2, sd, (10703, 8867, 3135));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (sa, s5) = RotatePi4Add::kernel::<13, 13>(sa, s5, (11585, 5793));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (s6, s9) = RotatePi4Add::kernel::<13, 13>(s6, s9, (11585, 5793));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (se, s1) = RotatePi4Add::kernel::<13, 13>(se, s1, (11585, 5793));

  store_coeffs!(
    output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf
  );
}
1148
// Forward 32-point Type-II DCT (orthonormal input).
//
// Classic even/odd decomposition: the first butterfly stage splits the 32
// inputs into sums and differences, which feed a 16-point DCT-II and a
// 16-point DST-IV with asymmetric input; the DST half is reversed in place.
// NOTE(review): the `*h` values from butterfly_neg/butterfly_add appear to
// be the half-scaled companions consumed by the *_asym transforms — the
// helpers are defined outside this section, confirm there.
#[$m]
$($s)* fn daala_fdct_ii_32<T: TxOperations>(
  t0: T, t1: T, t2: T, t3: T, t4: T, t5: T, t6: T, t7: T, t8: T, t9: T, ta: T,
  tb: T, tc: T, td: T, te: T, tf: T, tg: T, th: T, ti: T, tj: T, tk: T, tl: T,
  tm: T, tn: T, to: T, tp: T, tq: T, tr: T, ts: T, tt: T, tu: T, tv: T,
  output: &mut [T],
) {
  // +/- Butterflies with asymmetric output.
  let (t0h, tv) = butterfly_neg(t0, tv);
  let (t1, tuh) = butterfly_add(t1, tu);
  let (t2h, tt) = butterfly_neg(t2, tt);
  let (t3, tsh) = butterfly_add(t3, ts);
  let (t4h, tr) = butterfly_neg(t4, tr);
  let (t5, tqh) = butterfly_add(t5, tq);
  let (t6h, tp) = butterfly_neg(t6, tp);
  let (t7, toh) = butterfly_add(t7, to);
  let (t8h, tn) = butterfly_neg(t8, tn);
  let (t9, tmh) = butterfly_add(t9, tm);
  let (tah, tl) = butterfly_neg(ta, tl);
  let (tb, tkh) = butterfly_add(tb, tk);
  let (tch, tj) = butterfly_neg(tc, tj);
  let (td, tih) = butterfly_add(td, ti);
  let (teh, th) = butterfly_neg(te, th);
  let (tf, tgh) = butterfly_add(tf, tg);

  // Embedded 16-point transforms with asymmetric input.
  daala_fdct_ii_16_asym(
    t0h,
    t1,
    t2h,
    t3,
    t4h,
    t5,
    t6h,
    t7,
    t8h,
    t9,
    tah,
    tb,
    tch,
    td,
    teh,
    tf,
    &mut output[0..16],
  );
  daala_fdst_iv_16_asym(
    tv,
    tuh,
    tt,
    tsh,
    tr,
    tqh,
    tp,
    toh,
    tn,
    tmh,
    tl,
    tkh,
    tj,
    tih,
    th,
    tgh,
    &mut output[16..32],
  );
  output[16..32].reverse();
}
1215
1216#[$m]
1217$($s)* fn daala_fdct32<T: TxOperations>(coeffs: &mut [T]) {
1218  assert!(coeffs.len() >= 32);
1219  let mut temp_out: [T; 32] = [T::zero(); 32];
1220  daala_fdct_ii_32(
1221    coeffs[0],
1222    coeffs[1],
1223    coeffs[2],
1224    coeffs[3],
1225    coeffs[4],
1226    coeffs[5],
1227    coeffs[6],
1228    coeffs[7],
1229    coeffs[8],
1230    coeffs[9],
1231    coeffs[10],
1232    coeffs[11],
1233    coeffs[12],
1234    coeffs[13],
1235    coeffs[14],
1236    coeffs[15],
1237    coeffs[16],
1238    coeffs[17],
1239    coeffs[18],
1240    coeffs[19],
1241    coeffs[20],
1242    coeffs[21],
1243    coeffs[22],
1244    coeffs[23],
1245    coeffs[24],
1246    coeffs[25],
1247    coeffs[26],
1248    coeffs[27],
1249    coeffs[28],
1250    coeffs[29],
1251    coeffs[30],
1252    coeffs[31],
1253    &mut temp_out,
1254  );
1255
1256  coeffs[0] = temp_out[0];
1257  coeffs[1] = temp_out[16];
1258  coeffs[2] = temp_out[8];
1259  coeffs[3] = temp_out[24];
1260  coeffs[4] = temp_out[4];
1261  coeffs[5] = temp_out[20];
1262  coeffs[6] = temp_out[12];
1263  coeffs[7] = temp_out[28];
1264  coeffs[8] = temp_out[2];
1265  coeffs[9] = temp_out[18];
1266  coeffs[10] = temp_out[10];
1267  coeffs[11] = temp_out[26];
1268  coeffs[12] = temp_out[6];
1269  coeffs[13] = temp_out[22];
1270  coeffs[14] = temp_out[14];
1271  coeffs[15] = temp_out[30];
1272  coeffs[16] = temp_out[1];
1273  coeffs[17] = temp_out[17];
1274  coeffs[18] = temp_out[9];
1275  coeffs[19] = temp_out[25];
1276  coeffs[20] = temp_out[5];
1277  coeffs[21] = temp_out[21];
1278  coeffs[22] = temp_out[13];
1279  coeffs[23] = temp_out[29];
1280  coeffs[24] = temp_out[3];
1281  coeffs[25] = temp_out[19];
1282  coeffs[26] = temp_out[11];
1283  coeffs[27] = temp_out[27];
1284  coeffs[28] = temp_out[7];
1285  coeffs[29] = temp_out[23];
1286  coeffs[30] = temp_out[15];
1287  coeffs[31] = temp_out[31];
1288}
1289
// Forward 32-point Type-II DCT with asymmetric input.
//
// Mirrors daala_fdct_ii_32, but the inputs already alternate between
// half-scaled `*h` values and `(T, T)` pairs from the caller's butterfly
// stage. The asymmetric butterflies here restore plain values, which then
// feed the orthonormal 16-point DCT-II / DST-IV; the DST half is reversed
// in place. NOTE(review): the `*h`/pair scaling convention is defined by
// the butterfly_* helpers outside this section — confirm there.
#[$m]
$($s)* fn daala_fdct_ii_32_asym<T: TxOperations>(
  t0h: T, t1: (T, T), t2h: T, t3: (T, T), t4h: T, t5: (T, T), t6h: T,
  t7: (T, T), t8h: T, t9: (T, T), tah: T, tb: (T, T), tch: T, td: (T, T),
  teh: T, tf: (T, T), tgh: T, th: (T, T), tih: T, tj: (T, T), tkh: T,
  tl: (T, T), tmh: T, tn: (T, T), toh: T, tp: (T, T), tqh: T, tr: (T, T),
  tsh: T, tt: (T, T), tuh: T, tv: (T, T), output: &mut [T],
) {
  // +/- Butterflies with asymmetric input.
  let (t0, tv) = butterfly_neg_asym(t0h, tv);
  let (t1, tu) = butterfly_sub_asym(t1, tuh);
  let (t2, tt) = butterfly_neg_asym(t2h, tt);
  let (t3, ts) = butterfly_sub_asym(t3, tsh);
  let (t4, tr) = butterfly_neg_asym(t4h, tr);
  let (t5, tq) = butterfly_sub_asym(t5, tqh);
  let (t6, tp) = butterfly_neg_asym(t6h, tp);
  let (t7, to) = butterfly_sub_asym(t7, toh);
  let (t8, tn) = butterfly_neg_asym(t8h, tn);
  let (t9, tm) = butterfly_sub_asym(t9, tmh);
  let (ta, tl) = butterfly_neg_asym(tah, tl);
  let (tb, tk) = butterfly_sub_asym(tb, tkh);
  let (tc, tj) = butterfly_neg_asym(tch, tj);
  let (td, ti) = butterfly_sub_asym(td, tih);
  let (te, th) = butterfly_neg_asym(teh, th);
  let (tf, tg) = butterfly_sub_asym(tf, tgh);

  // Embedded 16-point orthonormal transforms.
  daala_fdct_ii_16(
    t0,
    t1,
    t2,
    t3,
    t4,
    t5,
    t6,
    t7,
    t8,
    t9,
    ta,
    tb,
    tc,
    td,
    te,
    tf,
    &mut output[0..16],
  );
  daala_fdst_iv_16(
    tv,
    tu,
    tt,
    ts,
    tr,
    tq,
    tp,
    to,
    tn,
    tm,
    tl,
    tk,
    tj,
    ti,
    th,
    tg,
    &mut output[16..32],
  );
  output[16..32].reverse();
}
1357
// Forward 32-point Type-IV DST with asymmetric input.
//
// Inputs alternate between `(T, T)` pairs and single half-scaled `*h`
// values produced by the caller's asymmetric butterfly stage.
// NOTE(review): the pair/half scaling convention is defined by the
// butterfly_*/kernel helpers, which are not visible in this section —
// confirm there.
//
// Each rotation's comment lists its three fixed-point constants as
// numerator/2^shift fractions; the const generics on the kernel calls are
// the matching shift amounts (e.g. 8192 = 2^13 pairs with shift 13).
// Statement order and shadowing are significant: each stage consumes the
// rebindings produced by the previous one.
#[$m]
$($s)* fn daala_fdst_iv_32_asym<T: TxOperations>(
  t0: (T, T), t1h: T, t2: (T, T), t3h: T, t4: (T, T), t5h: T, t6: (T, T),
  t7h: T, t8: (T, T), t9h: T, ta: (T, T), tbh: T, tc: (T, T), tdh: T,
  te: (T, T), tfh: T, tg: (T, T), thh: T, ti: (T, T), tjh: T, tk: (T, T),
  tlh: T, tm: (T, T), tnh: T, to: (T, T), tph: T, tq: (T, T), trh: T,
  ts: (T, T), tth: T, tu: (T, T), tvh: T, output: &mut [T],
) {
  // Stage 0
  //   5933/8192 = (Sin[63*Pi/128] + Cos[63*Pi/128])/Sqrt[2] = 0.72424708295147
  // 22595/16384 = (Sin[63*Pi/128] - Cos[63*Pi/128])*Sqrt[2] = 1.37908108947413
  //  1137/32768 = Cos[63*Pi/128]*Sqrt[2]                    = 0.03470653821440
  let (t0, tv) =
    RotateAdd::half_kernel::<13, 14, 15>(t0, tvh, (5933, 22595, 1137));
  //   6203/8192 = (Sin[61*Pi/128] + Cos[61*Pi/128])/Sqrt[2] = 0.75720884650648
  // 21403/16384 = (Sin[61*Pi/128] - Cos[61*Pi/128])*Sqrt[2] = 1.30634568590755
  //  3409/32768 = Cos[61*Pi/128]*Sqrt[2]                    = 0.10403600355271
  let (tu, t1) =
    RotateSub::half_kernel::<13, 14, 15>(tu, t1h, (6203, 21403, 3409));
  // 25833/32768 = (Sin[59*Pi/128] + Cos[59*Pi/128])/Sqrt[2] = 0.78834642762661
  //     315/256 = (Sin[59*Pi/128] - Cos[59*Pi/128])*Sqrt[2] = 1.23046318116125
  //  5673/32768 = Cos[59*Pi/128]*Sqrt[2]                    = 0.17311483704598
  let (t2, tt) =
    RotateAdd::half_kernel::<15, 8, 15>(t2, tth, (25833, 315, 5673));
  // 26791/32768 = (Sin[57*Pi/128] + Cos[57*Pi/128])/Sqrt[2] = 0.81758481315158
  //   4717/4096 = (Sin[57*Pi/128] - Cos[57*Pi/128])*Sqrt[2] = 1.15161638283569
  //  7923/32768 = Cos[57*Pi/128]*Sqrt[2]                    = 0.24177662173374
  let (ts, t3) =
    RotateSub::half_kernel::<15, 12, 15>(ts, t3h, (26791, 4717, 7923));
  //   6921/8192 = (Sin[55*Pi/128] + Cos[55*Pi/128])/Sqrt[2] = 0.84485356524971
  // 17531/16384 = (Sin[55*Pi/128] - Cos[55*Pi/128])*Sqrt[2] = 1.06999523977419
  // 10153/32768 = Cos[55*Pi/128]*Sqrt[2]                    = 0.30985594536261
  let (t4, tr) =
    RotateAdd::half_kernel::<13, 14, 15>(t4, trh, (6921, 17531, 10153));
  // 28511/32768 = (Sin[53*Pi/128] + Cos[53*Pi/128])/Sqrt[2] = 0.87008699110871
  // 32303/32768 = (Sin[53*Pi/128] - Cos[53*Pi/128])*Sqrt[2] = 0.98579638445957
  //   1545/4096 = Cos[53*Pi/128]*Sqrt[2]                    = 0.37718879887893
  let (tq, t5) =
    RotateSub::half_kernel::<15, 15, 12>(tq, t5h, (28511, 32303, 1545));
  // 29269/32768 = (Sin[51*Pi/128] + Cos[51*Pi/128])/Sqrt[2] = 0.89322430119552
  // 14733/16384 = (Sin[51*Pi/128] - Cos[51*Pi/128])*Sqrt[2] = 0.89922265930921
  //   1817/4096 = Cos[51*Pi/128]*Sqrt[2]                    = 0.44361297154091
  let (t6, tp) =
    RotateAdd::half_kernel::<15, 14, 12>(t6, tph, (29269, 14733, 1817));
  // 29957/32768 = (Sin[49*Pi/128] + Cos[49*Pi/128])/Sqrt[2] = 0.91420975570353
  // 13279/16384 = (Sin[49*Pi/128] - Cos[49*Pi/128])*Sqrt[2] = 0.81048262800998
  //  8339/16384 = Cos[49*Pi/128]*Sqrt[2]                    = 0.50896844169854
  let (to, t7) =
    RotateSub::half_kernel::<15, 14, 14>(to, t7h, (29957, 13279, 8339));
  //   7643/8192 = (Sin[47*Pi/128] + Cos[47*Pi/128])/Sqrt[2] = 0.93299279883474
  // 11793/16384 = (Sin[47*Pi/128] - Cos[47*Pi/128])*Sqrt[2] = 0.71979007306998
  // 18779/32768 = Cos[47*Pi/128]*Sqrt[2]                    = 0.57309776229975
  let (t8, tn) =
    RotateAdd::half_kernel::<13, 14, 15>(t8, tnh, (7643, 11793, 18779));
  // 15557/16384 = (Sin[45*Pi/128] + Cos[45*Pi/128])/Sqrt[2] = 0.94952818059304
  // 20557/32768 = (Sin[45*Pi/128] - Cos[45*Pi/128])*Sqrt[2] = 0.62736348079778
  // 20835/32768 = Cos[45*Pi/128]*Sqrt[2]                    = 0.63584644019415
  let (tm, t9) =
    RotateSub::half_kernel::<14, 15, 15>(tm, t9h, (15557, 20557, 20835));
  // 31581/32768 = (Sin[43*Pi/128] + Cos[43*Pi/128])/Sqrt[2] = 0.96377606579544
  // 17479/32768 = (Sin[43*Pi/128] - Cos[43*Pi/128])*Sqrt[2] = 0.53342551494980
  // 22841/32768 = Cos[43*Pi/128]*Sqrt[2]                    = 0.69706330832054
  let (ta, tl) =
    RotateAdd::half_kernel::<15, 15, 15>(ta, tlh, (31581, 17479, 22841));
  //   7993/8192 = (Sin[41*Pi/128] + Cos[41*Pi/128])/Sqrt[2] = 0.97570213003853
  // 14359/32768 = (Sin[41*Pi/128] - Cos[41*Pi/128])*Sqrt[2] = 0.43820248031374
  //   3099/4096 = Cos[41*Pi/128]*Sqrt[2]                    = 0.75660088988166
  let (tk, tb) =
    RotateSub::half_kernel::<13, 15, 12>(tk, tbh, (7993, 14359, 3099));
  // 16143/16384 = (Sin[39*Pi/128] + Cos[39*Pi/128])/Sqrt[2] = 0.98527764238894
  //   2801/8192 = (Sin[39*Pi/128] - Cos[39*Pi/128])*Sqrt[2] = 0.34192377752060
  // 26683/32768 = Cos[39*Pi/128]*Sqrt[2]                    = 0.81431575362864
  let (tc, tj) =
    RotateAdd::half_kernel::<14, 13, 15>(tc, tjh, (16143, 2801, 26683));
  // 16261/16384 = (Sin[37*Pi/128] + Cos[37*Pi/128])/Sqrt[2] = 0.99247953459871
  //  4011/16384 = (Sin[37*Pi/128] - Cos[37*Pi/128])*Sqrt[2] = 0.24482135039843
  // 14255/16384 = Cos[37*Pi/128]*Sqrt[2]                    = 0.87006885939949
  let (ti, td) =
    RotateSub::half_kernel::<14, 14, 14>(ti, tdh, (16261, 4011, 14255));
  // 32679/32768 = (Sin[35*Pi/128] + Cos[35*Pi/128])/Sqrt[2] = 0.99729045667869
  //  4821/32768 = (Sin[35*Pi/128] - Cos[35*Pi/128])*Sqrt[2] = 0.14712912719933
  // 30269/32768 = Cos[35*Pi/128]*Sqrt[2]                    = 0.92372589307902
  let (te, th) =
    RotateAdd::half_kernel::<15, 15, 15>(te, thh, (32679, 4821, 30269));
  // 16379/16384 = (Sin[33*Pi/128] + Cos[33*Pi/128])/Sqrt[2] = 0.99969881869620
  //    201/4096 = (Sin[33*Pi/128] - Cos[33*Pi/128])*Sqrt[2] = 0.04908245704582
  // 15977/16384 = Cos[33*Pi/128]*Sqrt[2]                    = 0.97515759017329
  let (tg, tf) =
    RotateSub::half_kernel::<14, 12, 14>(tg, tfh, (16379, 201, 15977));

  // Stage 1
  let (t0, tfh) = butterfly_add(t0, tf);
  let (tv, tgh) = butterfly_sub(tv, tg);
  let (th, tuh) = butterfly_add(th, tu);
  let (te, t1h) = butterfly_sub(te, t1);
  let (t2, tdh) = butterfly_add(t2, td);
  let (tt, tih) = butterfly_sub(tt, ti);
  let (tj, tsh) = butterfly_add(tj, ts);
  let (tc, t3h) = butterfly_sub(tc, t3);
  let (t4, tbh) = butterfly_add(t4, tb);
  let (tr, tkh) = butterfly_sub(tr, tk);
  let (tl, tqh) = butterfly_add(tl, tq);
  let (ta, t5h) = butterfly_sub(ta, t5);
  let (t6, t9h) = butterfly_add(t6, t9);
  let (tp, tmh) = butterfly_sub(tp, tm);
  let (tn, toh) = butterfly_add(tn, to);
  let (t8, t7h) = butterfly_sub(t8, t7);

  // Stage 2
  let (t0, t7) = butterfly_sub_asym(t0, t7h);
  let (tv, to) = butterfly_add_asym(tv, toh);
  let (tp, tu) = butterfly_sub_asym(tp, tuh);
  let (t6, t1) = butterfly_add_asym(t6, t1h);
  let (t2, t5) = butterfly_sub_asym(t2, t5h);
  let (tt, tq) = butterfly_add_asym(tt, tqh);
  let (tr, ts) = butterfly_sub_asym(tr, tsh);
  let (t4, t3) = butterfly_add_asym(t4, t3h);
  let (t8, tg) = butterfly_add_asym(t8, tgh);
  let (te, tm) = butterfly_sub_asym(te, tmh);
  let (tn, tf) = butterfly_add_asym(tn, tfh);
  let (th, t9) = butterfly_sub_asym(th, t9h);
  let (ta, ti) = butterfly_add_asym(ta, tih);
  let (tc, tk) = butterfly_sub_asym(tc, tkh);
  let (tl, td) = butterfly_add_asym(tl, tdh);
  let (tj, tb) = butterfly_sub_asym(tj, tbh);

  // Stage 3
  // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
  // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
  //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
  let (tf, tg) =
    RotateSub::kernel::<14, 14, 13>(tf, tg, (17911, 14699, 803));
  //  10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712
  //   5461/8192 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.6666556584777465
  //   1189/4096 = Cos[13*Pi/32]                 = 0.2902846772544623
  let (th, te) =
    RotateAdd::kernel::<13, 13, 12>(th, te, (10217, 5461, 1189));
  //   5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
  //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
  //  7723/16384 = Cos[11*Pi/32]                 = 0.4713967368259976
  let (ti, td) =
    RotateAdd::kernel::<12, 13, 14>(ti, td, (5543, 3363, 7723));
  //  11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
  //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
  //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
  let (tc, tj) =
    RotateSub::kernel::<13, 14, 13>(tc, tj, (11529, 2271, 5197));
  //  11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
  //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
  //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
  let (tb, tk) =
    RotateNeg::kernel::<13, 14, 13>(tb, tk, (11529, 2271, 5197));
  //   5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
  //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
  //  7723/16384 = Cos[11*Pi/32]                 = 0.4713967368259976
  let (ta, tl) =
    RotateNeg::kernel::<12, 13, 14>(ta, tl, (5543, 3363, 7723));
  //  10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712
  //   5461/8192 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.6666556584777465
  //   1189/4096 = Cos[13*Pi/32]                 = 0.2902846772544623
  let (t9, tm) =
    RotateNeg::kernel::<13, 13, 12>(t9, tm, (10217, 5461, 1189));
  // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
  // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
  //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
  let (t8, tn) =
    RotateNeg::kernel::<14, 14, 13>(t8, tn, (17911, 14699, 803));

  // Stage 4
  // The `_`-prefixed halves are unused by this variant.
  let (t3, t0h) = butterfly_sub(t3, t0);
  let (ts, tvh) = butterfly_add(ts, tv);
  let (tu, tth) = butterfly_sub(tu, tt);
  let (t1, t2h) = butterfly_add(t1, t2);
  let ((_toh, to), t4h) = butterfly_add(to, t4);
  let ((_tqh, tq), t6h) = butterfly_sub(tq, t6);
  let ((_t7h, t7), trh) = butterfly_add(t7, tr);
  let ((_t5h, t5), tph) = butterfly_sub(t5, tp);
  let (tb, t8h) = butterfly_sub(tb, t8);
  let (tk, tnh) = butterfly_add(tk, tn);
  let (tm, tlh) = butterfly_sub(tm, tl);
  let (t9, tah) = butterfly_add(t9, ta);
  let (tf, tch) = butterfly_sub(tf, tc);
  let (tg, tjh) = butterfly_add(tg, tj);
  let (ti, thh) = butterfly_sub(ti, th);
  let (td, teh) = butterfly_add(td, te);

  // Stage 5
  //     301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  //   1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (to, t7) = RotateAdd::kernel::<8, 11, 15>(to, t7, (301, 1609, 6393));
  //  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (tph, t6h) =
    RotateAdd::kernel::<13, 15, 13>(tph, t6h, (11363, 9041, 4551));
  //   5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (t5, tq) =
    RotateNeg::kernel::<12, 15, 13>(t5, tq, (5681, 9041, 4551));
  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (t4h, trh) =
    RotateNeg::kernel::<13, 14, 15>(t4h, trh, (9633, 12873, 6393));

  // Stage 6
  let (t1, t0) = butterfly_add_asym(t1, t0h);
  let (tu, tv) = butterfly_sub_asym(tu, tvh);
  let (ts, t2) = butterfly_sub_asym(ts, t2h);
  let (t3, tt) = butterfly_sub_asym(t3, tth);
  let (t5, t4) = butterfly_add_asym((t5.rshift1(), t5), t4h);
  let (tq, tr) = butterfly_sub_asym((tq.rshift1(), tq), trh);
  let (t7, t6) = butterfly_add_asym((t7.rshift1(), t7), t6h);
  let (to, tp) = butterfly_sub_asym((to.rshift1(), to), tph);
  let (t9, t8) = butterfly_add_asym(t9, t8h);
  let (tm, tn) = butterfly_sub_asym(tm, tnh);
  let (tk, ta) = butterfly_sub_asym(tk, tah);
  let (tb, tl) = butterfly_sub_asym(tb, tlh);
  let (ti, tc) = butterfly_add_asym(ti, tch);
  let (td, tj) = butterfly_add_asym(td, tjh);
  let (tf, te) = butterfly_add_asym(tf, teh);
  let (tg, th) = butterfly_sub_asym(tg, thh);

  // Stage 7
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (t2, tt) = RotateNeg::kernel::<9, 14, 13>(t2, tt, (669, 8867, 3135));
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (ts, t3) = RotateAdd::kernel::<9, 14, 13>(ts, t3, (669, 8867, 3135));
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (ta, tl) = RotateNeg::kernel::<9, 14, 13>(ta, tl, (669, 8867, 3135));
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (tk, tb) = RotateAdd::kernel::<9, 14, 13>(tk, tb, (669, 8867, 3135));
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (tc, tj) = RotateAdd::kernel::<9, 14, 13>(tc, tj, (669, 8867, 3135));
  //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (ti, td) = RotateNeg::kernel::<9, 14, 13>(ti, td, (669, 8867, 3135));
  //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (tu, t1) = RotatePi4Add::kernel::<12, 13>(tu, t1, (5793, 5793));
  //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (tq, t5) = RotatePi4Add::kernel::<12, 13>(tq, t5, (5793, 5793));
  //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (tp, t6) = RotatePi4Sub::kernel::<12, 13>(tp, t6, (5793, 5793));
  //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (tm, t9) = RotatePi4Add::kernel::<12, 13>(tm, t9, (5793, 5793));
  //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (te, th) = RotatePi4Add::kernel::<12, 13>(te, th, (5793, 5793));

  store_coeffs!(
    output, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf,
    tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv
  );
}
1629
1630#[allow(clippy::identity_op)]
1631#[$m]
1632$($s)* fn daala_fdct64<T: TxOperations>(coeffs: &mut [T]) {
1633  assert!(coeffs.len() >= 64);
1634  // Use arrays to avoid ridiculous variable names
1635  let mut asym: [(T, T); 32] = [(T::zero(), T::zero()); 32];
1636  let mut half: [T; 32] = [T::zero(); 32];
1637  // +/- Butterflies with asymmetric output.
1638  {
1639    #[$m]
1640    #[inline]
1641    $($s)* fn butterfly_pair<T: TxOperations>(
1642      half: &mut [T; 32], asym: &mut [(T, T); 32], input: &[T], i: usize
1643    ) {
1644      let j = i * 2;
1645      let (ah, c) = butterfly_neg(input[j], input[63 - j]);
1646      let (b, dh) = butterfly_add(input[j + 1], input[63 - j - 1]);
1647      half[i] = ah;
1648      half[31 - i] = dh;
1649      asym[i] = b;
1650      asym[31 - i] = c;
1651    }
1652    butterfly_pair(&mut half, &mut asym, coeffs, 0);
1653    butterfly_pair(&mut half, &mut asym, coeffs, 1);
1654    butterfly_pair(&mut half, &mut asym, coeffs, 2);
1655    butterfly_pair(&mut half, &mut asym, coeffs, 3);
1656    butterfly_pair(&mut half, &mut asym, coeffs, 4);
1657    butterfly_pair(&mut half, &mut asym, coeffs, 5);
1658    butterfly_pair(&mut half, &mut asym, coeffs, 6);
1659    butterfly_pair(&mut half, &mut asym, coeffs, 7);
1660    butterfly_pair(&mut half, &mut asym, coeffs, 8);
1661    butterfly_pair(&mut half, &mut asym, coeffs, 9);
1662    butterfly_pair(&mut half, &mut asym, coeffs, 10);
1663    butterfly_pair(&mut half, &mut asym, coeffs, 11);
1664    butterfly_pair(&mut half, &mut asym, coeffs, 12);
1665    butterfly_pair(&mut half, &mut asym, coeffs, 13);
1666    butterfly_pair(&mut half, &mut asym, coeffs, 14);
1667    butterfly_pair(&mut half, &mut asym, coeffs, 15);
1668  }
1669
1670  let mut temp_out: [T; 64] = [T::zero(); 64];
1671  // Embedded 2-point transforms with asymmetric input.
1672  daala_fdct_ii_32_asym(
1673    half[0],
1674    asym[0],
1675    half[1],
1676    asym[1],
1677    half[2],
1678    asym[2],
1679    half[3],
1680    asym[3],
1681    half[4],
1682    asym[4],
1683    half[5],
1684    asym[5],
1685    half[6],
1686    asym[6],
1687    half[7],
1688    asym[7],
1689    half[8],
1690    asym[8],
1691    half[9],
1692    asym[9],
1693    half[10],
1694    asym[10],
1695    half[11],
1696    asym[11],
1697    half[12],
1698    asym[12],
1699    half[13],
1700    asym[13],
1701    half[14],
1702    asym[14],
1703    half[15],
1704    asym[15],
1705    &mut temp_out[0..32],
1706  );
1707  daala_fdst_iv_32_asym(
1708    asym[31],
1709    half[31],
1710    asym[30],
1711    half[30],
1712    asym[29],
1713    half[29],
1714    asym[28],
1715    half[28],
1716    asym[27],
1717    half[27],
1718    asym[26],
1719    half[26],
1720    asym[25],
1721    half[25],
1722    asym[24],
1723    half[24],
1724    asym[23],
1725    half[23],
1726    asym[22],
1727    half[22],
1728    asym[21],
1729    half[21],
1730    asym[20],
1731    half[20],
1732    asym[19],
1733    half[19],
1734    asym[18],
1735    half[18],
1736    asym[17],
1737    half[17],
1738    asym[16],
1739    half[16],
1740    &mut temp_out[32..64],
1741  );
1742  temp_out[32..64].reverse();
1743
1744  // Store a reordered version of output in temp_out
1745  #[$m]
1746  #[inline]
1747  $($s)* fn reorder_4<T: TxOperations>(
1748    output: &mut [T], i: usize, tmp: [T; 64], j: usize
1749  ) {
1750    output[0 + i * 4] = tmp[0 + j];
1751    output[1 + i * 4] = tmp[32 + j];
1752    output[2 + i * 4] = tmp[16 + j];
1753    output[3 + i * 4] = tmp[48 + j];
1754  }
1755  reorder_4(coeffs, 0, temp_out, 0);
1756  reorder_4(coeffs, 1, temp_out, 8);
1757  reorder_4(coeffs, 2, temp_out, 4);
1758  reorder_4(coeffs, 3, temp_out, 12);
1759  reorder_4(coeffs, 4, temp_out, 2);
1760  reorder_4(coeffs, 5, temp_out, 10);
1761  reorder_4(coeffs, 6, temp_out, 6);
1762  reorder_4(coeffs, 7, temp_out, 14);
1763
1764  reorder_4(coeffs, 8, temp_out, 1);
1765  reorder_4(coeffs, 9, temp_out, 9);
1766  reorder_4(coeffs, 10, temp_out, 5);
1767  reorder_4(coeffs, 11, temp_out, 13);
1768  reorder_4(coeffs, 12, temp_out, 3);
1769  reorder_4(coeffs, 13, temp_out, 11);
1770  reorder_4(coeffs, 14, temp_out, 7);
1771  reorder_4(coeffs, 15, temp_out, 15);
1772}
1773
// Forward identity transform: a deliberate no-op — the coefficients pass
// through unchanged.
#[$m]
$($s)* fn fidentity<T: TxOperations>(_coeffs: &mut [T]) {}
1776
1777#[$m]
1778$($s)* fn fwht4<T: TxOperations>(coeffs: &mut [T]) {
1779  assert!(coeffs.len() >= 4);
1780  let x0 = coeffs[0];
1781  let x1 = coeffs[1];
1782  let x2 = coeffs[2];
1783  let x3 = coeffs[3];
1784
1785  let s0 = x0.add(x1);
1786  let s1 = x3.sub(x2);
1787  let s2 = s0.sub_avg(s1);
1788
1789  let q1 = s2.sub(x2);
1790  let q0 = s0.sub(q1);
1791  let q3 = s2.sub(x1);
1792  let q2 = s1.add(q3);
1793
1794  store_coeffs!(coeffs, q0, q1, q2, q3);
1795}
1796
1797}
1798
1799}