Skip to main content

zune_jpeg/
idct.rs

1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! Routines for IDCT
10//!
//! Essentially we provide 2 routines for IDCT, a scalar implementation and a not super optimized
//! AVX2 one; I'll talk about them here.
13//!
14//! There are 2 reasons why we have the avx one
//! 1. No one compiles with `-C target-feature=+avx2`, hence binaries probably won't take
//! advantage of it (even if it exists).
17//! 2. AVX employs zero short circuit in a way the scalar code cannot employ it.
18//!     - AVX does this by checking for MCU's whose 63 AC coefficients are zero and if true, it writes
19//!        values directly, if false, it goes the long way of calculating.
//!     -   Although this can be trivially implemented in the scalar version, it generates code
//!         I'm not happy with (a scalar version that basically loops, and that is too many branches for me).
//!         The AVX one does a better job by using bitwise ORs (`_mm256_or_si256`), which is orders of
//!         magnitude faster than anything I could come up with.
24//!
//! The AVX code also has some cool transpose_u16 instructions which look complicated enough to be cool
//! (spoiler alert, I barely understand how they work, that's why I credited the owner).
27//!
28#![allow(
29    clippy::excessive_precision,
30    clippy::unreadable_literal,
31    clippy::module_name_repetitions,
32    unused_parens,
33    clippy::wildcard_imports
34)]
35
36use zune_core::log::debug;
37use zune_core::options::DecoderOptions;
38
39use crate::decoder::IDCTPtr;
40use crate::idct::scalar::{idct_int, idct_int_1x1};
41
42#[cfg(feature = "x86")]
43pub mod avx2;
44#[cfg(feature = "neon")]
45pub mod neon;
46
47pub mod scalar;
48
49/// Choose an appropriate IDCT function
50#[allow(unused_variables)]
51pub fn choose_idct_func(options: &DecoderOptions) -> IDCTPtr {
52    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
53    #[cfg(feature = "x86")]
54    {
55        if options.use_avx2() {
56            debug!("Using vector integer IDCT");
57            return |a: &mut [i32; 64], b: &mut [i16], c: usize| {
58                // SAFETY: `options.use_avx2()` only returns true if avx2 is supported.
59                unsafe { avx2::idct_avx2(a,b,c) }
60            };
61        }
62    }
63    #[cfg(target_arch = "aarch64")]
64    #[cfg(feature = "neon")]
65    {
66        if options.use_neon() {
67            debug!("Using vector integer IDCT");
68            return |a: &mut [i32; 64], b: &mut [i16], c: usize| {
69                // SAFETY: `options.use_neon()` only returns true if neon is supported.
70                unsafe { neon::idct_neon(a,b,c) }
71            };
72        }
73    }
74    debug!("Using scalar integer IDCT");
75    // use generic one
76    return idct_int;
77}
78
79/// Choose a function to implement 4x4 IDCT.
80///
81/// These functions get the same input but have an extra contract: Only the first 4x4 block of
82/// coefficients are non-zero. All other entries are zeroed.
83///
84/// **The callee must uphold that contract on return**
85pub fn choose_idct_4x4_func(_options: &DecoderOptions) -> IDCTPtr {
86    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
87    #[cfg(feature = "x86")]
88    {
89        if _options.use_avx2() {
90            debug!("Using vector integer IDCT");
91            return |a: &mut [i32; 64], b: &mut [i16], c: usize| {
92                // SAFETY: `options.use_avx2()` only returns true if avx2 is supported.
93                unsafe { avx2::idct_avx2_4x4(a,b,c) }
94            };
95        }
96    }
97
98    scalar::idct4x4
99}
100
101pub fn choose_idct_1x1_func(_: &DecoderOptions) -> IDCTPtr {
102    // These are simple stores, no alternative implementation for now
103    idct_int_1x1
104}
105
#[cfg(test)]
#[allow(unreachable_code, dead_code)]
mod tests {
    use super::*;

    /// Stride handed to every IDCT call in these tests.
    const STRIDE: usize = 8;

    /// Feed `coeff` to the IDCT chosen for the fast options and to the scalar routine,
    /// then check that both wrote the same output.
    fn assert_selected_matches_scalar(coeff: [i32; 64]) {
        // Each routine gets its own copy since the coefficients are passed mutably.
        let mut selected_input = coeff;
        let mut scalar_input = coeff;
        let mut output_vector = [0; 64];
        let mut output_scalar = [0; 64];

        let idct_func = choose_idct_func(&DecoderOptions::new_fast());
        idct_func(&mut selected_input, &mut output_vector, STRIDE);
        idct_int(&mut scalar_input, &mut output_scalar, STRIDE);

        assert_eq!(output_scalar, output_vector, "IDCT and scalar do not match");
    }

    #[test]
    fn idct_test0() {
        assert_selected_matches_scalar([10; 64]);
    }

    #[test]
    fn do_idct_test1() {
        assert_selected_matches_scalar([14; 64]);
    }

    #[test]
    fn do_idct_test2() {
        let mut coeff = [0; 64];
        coeff[0] = 255;
        coeff[63] = -256;
        assert_selected_matches_scalar(coeff);
    }

    #[test]
    fn do_idct_zeros() {
        assert_selected_matches_scalar([0; 64]);
    }

    /// Every full and 4x4 routine must agree on a block whose only non-zero
    /// coefficients sit in the first 4x4 corner.
    #[test]
    fn idct_4x4() {
        #[rustfmt::skip]
        const A: [i32; 32] = [
            -254, -7, 0, 0, 0, 0, 0, 0,
            7, 0, -30, 32,  0, 0, 0, 0,
            7, 0, -30, 32,  0, 0, 0, 0,
            7, 0, -30, 32,  0, 0, 0, 0,
        ];

        // Each routine under test, paired with the label reported on mismatch.
        let candidates: Vec<(&str, IDCTPtr)> = vec![
            ("safe idct", choose_idct_func(&DecoderOptions::new_safe())),
            ("safe idct 4x4", choose_idct_4x4_func(&DecoderOptions::new_safe())),
            ("fast idct", choose_idct_func(&DecoderOptions::new_fast())),
            ("fast idct 4x4", choose_idct_4x4_func(&DecoderOptions::new_fast())),
        ];

        let results: Vec<(&str, [i16; 64])> = candidates
            .into_iter()
            .map(|(name, idct)| {
                let mut coeffs = [0i32; 64];
                coeffs[..A.len()].copy_from_slice(&A);
                let mut output = [0i16; 64];

                idct(&mut coeffs, &mut output, 8);

                (name, output)
            })
            .collect();

        // Compare neighbouring results; a failure is labelled with the first of the pair.
        for pair in results.windows(2) {
            let (name, first) = &pair[0];
            let (_, second) = &pair[1];
            assert_eq!(first, second, "{name}");
        }
    }
}