//! `png/filter/paeth.rs` — Paeth filter implementations for PNG (un)filtering.
1use crate::BytesPerPixel;
2
3// This code path is used on non-x86_64 architectures but we allow dead code
4// for the test module to be able to access it.
5#[allow(dead_code)]
6pub(super) fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
7    // On ARM this algorithm performs much better than the one above adapted from stb,
8    // and this is the better-studied algorithm we've always used here,
9    // so we default to it on all non-x86 platforms.
10    let pa = (i16::from(b) - i16::from(c)).abs();
11    let pb = (i16::from(a) - i16::from(c)).abs();
12    let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
13
14    let mut out = a;
15    let mut min = pa;
16
17    if pb < min {
18        min = pb;
19        out = b;
20    }
21    if pc < min {
22        out = c;
23    }
24
25    out
26}
27
28pub(super) fn filter_paeth_stbi(a: i16, b: i16, c: i16) -> u8 {
29    // Decoding optimizes better with this algorithm than with `filter_paeth`
30    //
31    // This formulation looks very different from the reference in the PNG spec, but is
32    // actually equivalent and has favorable data dependencies and admits straightforward
33    // generation of branch-free code, which helps performance significantly.
34    //
35    // Adapted from public domain PNG implementation:
36    // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
37    let thresh = c * 3 - (a + b);
38    let lo = a.min(b);
39    let hi = a.max(b);
40    let t0 = if hi <= thresh { lo } else { c };
41    let t1 = if thresh <= lo { hi } else { t0 };
42    t1 as u8
43}
44
45pub(super) fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
46    // This is an optimized version of the paeth filter from the PNG specification, proposed by
47    // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
48    // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
49    //
50    //     p = a + b - c
51    //     pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
52    //     pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
53    //     pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
54    //
55    // Further optimizing the calculation of `pc` a bit tricker. However, notice that:
56    //
57    //        a > c && b > c
58    //    ==> (a - c) > 0 && (b - c) > 0
59    //    ==> pc > (a - c) && pc > (b - c)
60    //    ==> pc > |a - c| && pc > |b - c|
61    //    ==> pc > pb && pc > pa
62    //
63    // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
64    // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= b` and
65    // `c <= b` or vice versa:
66    //
67    //     pc = ||b - c| - |a - c|| =  |pa - pb| = max(pa, pb) - min(pa, pb)
68    //
69    let pa = b.max(c) - c.min(b);
70    let pb = a.max(c) - c.min(a);
71    let pc = if (a < c) == (c < b) {
72        pa.max(pb) - pa.min(pb)
73    } else {
74        255
75    };
76
77    if pa <= pb && pa <= pc {
78        a
79    } else if pb <= pc {
80        b
81    } else {
82        c
83    }
84}
85
/// Reverses the PNG Paeth filter in place for one scanline (non-x86_64 path).
///
/// `previous` is the already-reconstructed scanline above; `current` holds the
/// filtered bytes on entry and the reconstructed bytes on return. Each output
/// byte is `raw.wrapping_add(paeth(A, B, C))`, where `a_bpp` tracks the
/// reconstructed pixel to the left (A), the chunk of `previous` supplies the
/// pixel above (B), and `c_bpp` tracks the pixel above-left (C). Both state
/// arrays start at zero, which is the PNG convention for pixels outside the
/// image on the left edge.
#[allow(unreachable_code)]
#[cfg(not(target_arch = "x86_64"))]
pub(super) fn unfilter(tbpp: BytesPerPixel, previous: &[u8], current: &mut [u8]) {
    // Paeth filter pixels:
    // C B D
    // A X
    match tbpp {
        BytesPerPixel::One => {
            let mut a_bpp = [0; 1];
            let mut c_bpp = [0; 1];
            for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) {
                // Build the reconstructed pixel, then write it back as a whole
                // fixed-size array (the try_into is infallible for exact chunks).
                let new_chunk = [chunk[0].wrapping_add(filter_paeth(a_bpp[0], b_bpp[0], c_bpp[0]))];
                *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
                // Shift state: this pixel becomes A, and B becomes C, for the
                // next iteration.
                a_bpp = new_chunk;
                c_bpp = b_bpp.try_into().unwrap();
            }
        }
        BytesPerPixel::Two => {
            let mut a_bpp = [0; 2];
            let mut c_bpp = [0; 2];
            for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) {
                let new_chunk = [
                    chunk[0].wrapping_add(filter_paeth(a_bpp[0], b_bpp[0], c_bpp[0])),
                    chunk[1].wrapping_add(filter_paeth(a_bpp[1], b_bpp[1], c_bpp[1])),
                ];
                *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                a_bpp = new_chunk;
                c_bpp = b_bpp.try_into().unwrap();
            }
        }
        BytesPerPixel::Three => {
            #[cfg(all(feature = "unstable", not(target_vendor = "apple")))]
            {
                // Results in PR: https://github.com/image-rs/image-png/pull/632
                // Approximately 30% better on Arm Cortex A520, 7%
                // regression on Arm Cortex X4. Switched off on Apple
                // Silicon due to 10-12% regression.
                super::simd::paeth_unfilter_3bpp(current, previous);
                return;
            }
            let mut a_bpp = [0; 3];
            let mut c_bpp = [0; 3];

            // Trim both slices to whole pixels so the slice patterns in the
            // `while let` below always match exactly 3 bytes at a time.
            let mut previous = &previous[..previous.len() / 3 * 3];
            let current_len = current.len();
            let mut current = &mut current[..current_len / 3 * 3];

            // Destructure one pixel per iteration; this shape (from PR #632
            // benchmarks) optimizes better here than chunks_exact.
            while let ([c0, c1, c2, c_rest @ ..], [p0, p1, p2, p_rest @ ..]) = (current, previous) {
                current = c_rest;
                previous = p_rest;

                *c0 = c0.wrapping_add(filter_paeth(a_bpp[0], *p0, c_bpp[0]));
                *c1 = c1.wrapping_add(filter_paeth(a_bpp[1], *p1, c_bpp[1]));
                *c2 = c2.wrapping_add(filter_paeth(a_bpp[2], *p2, c_bpp[2]));

                a_bpp = [*c0, *c1, *c2];
                c_bpp = [*p0, *p1, *p2];
            }
        }
        BytesPerPixel::Four => {
            #[cfg(feature = "unstable")]
            {
                // Results in PR: https://github.com/image-rs/image-png/pull/633
                // No change on Apple Silicon, 42% better on Arm Cortex A520,
                // 10% better on Arm Cortex X4.
                super::simd::paeth_unfilter_4bpp(current, previous);
                return;
            }

            let mut a_bpp = [0; 4];
            let mut c_bpp = [0; 4];

            // `len & !3` rounds down to a multiple of 4 — same whole-pixel
            // trimming as the 3-bpp arm, using a cheaper mask.
            let mut previous = &previous[..previous.len() & !3];
            let current_len = current.len();
            let mut current = &mut current[..current_len & !3];

            while let ([c0, c1, c2, c3, c_rest @ ..], [p0, p1, p2, p3, p_rest @ ..]) =
                (current, previous)
            {
                current = c_rest;
                previous = p_rest;

                *c0 = c0.wrapping_add(filter_paeth(a_bpp[0], *p0, c_bpp[0]));
                *c1 = c1.wrapping_add(filter_paeth(a_bpp[1], *p1, c_bpp[1]));
                *c2 = c2.wrapping_add(filter_paeth(a_bpp[2], *p2, c_bpp[2]));
                *c3 = c3.wrapping_add(filter_paeth(a_bpp[3], *p3, c_bpp[3]));

                a_bpp = [*c0, *c1, *c2, *c3];
                c_bpp = [*p0, *p1, *p2, *p3];
            }
        }
        BytesPerPixel::Six => {
            let mut a_bpp = [0; 6];
            let mut c_bpp = [0; 6];
            for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) {
                let new_chunk = [
                    chunk[0].wrapping_add(filter_paeth(a_bpp[0], b_bpp[0], c_bpp[0])),
                    chunk[1].wrapping_add(filter_paeth(a_bpp[1], b_bpp[1], c_bpp[1])),
                    chunk[2].wrapping_add(filter_paeth(a_bpp[2], b_bpp[2], c_bpp[2])),
                    chunk[3].wrapping_add(filter_paeth(a_bpp[3], b_bpp[3], c_bpp[3])),
                    chunk[4].wrapping_add(filter_paeth(a_bpp[4], b_bpp[4], c_bpp[4])),
                    chunk[5].wrapping_add(filter_paeth(a_bpp[5], b_bpp[5], c_bpp[5])),
                ];
                *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                a_bpp = new_chunk;
                c_bpp = b_bpp.try_into().unwrap();
            }
        }
        BytesPerPixel::Eight => {
            let mut a_bpp = [0; 8];
            let mut c_bpp = [0; 8];
            for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
                let new_chunk = [
                    chunk[0].wrapping_add(filter_paeth(a_bpp[0], b_bpp[0], c_bpp[0])),
                    chunk[1].wrapping_add(filter_paeth(a_bpp[1], b_bpp[1], c_bpp[1])),
                    chunk[2].wrapping_add(filter_paeth(a_bpp[2], b_bpp[2], c_bpp[2])),
                    chunk[3].wrapping_add(filter_paeth(a_bpp[3], b_bpp[3], c_bpp[3])),
                    chunk[4].wrapping_add(filter_paeth(a_bpp[4], b_bpp[4], c_bpp[4])),
                    chunk[5].wrapping_add(filter_paeth(a_bpp[5], b_bpp[5], c_bpp[5])),
                    chunk[6].wrapping_add(filter_paeth(a_bpp[6], b_bpp[6], c_bpp[6])),
                    chunk[7].wrapping_add(filter_paeth(a_bpp[7], b_bpp[7], c_bpp[7])),
                ];
                *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                a_bpp = new_chunk;
                c_bpp = b_bpp.try_into().unwrap();
            }
        }
    }
}
215
216/// The x86_64 functions avoid casting between u8xN and i16xN SIMD
217/// representations when possible by maintaining [i16; BPP] arrays
218/// between iterations instead of [u8; BPP].
219#[allow(unreachable_code)]
220#[cfg(target_arch = "x86_64")]
221pub(super) fn unfilter(tbpp: BytesPerPixel, previous: &[u8], current: &mut [u8]) {
222    // Paeth filter pixels:
223    // C B D
224    // A X
225    match tbpp {
226        BytesPerPixel::One => {
227            const BPP: usize = 1;
228            let mut a_bpp = [0; BPP];
229            let mut c_bpp = [0; BPP];
230
231            for (c, p) in current
232                .chunks_exact_mut(BPP)
233                .zip(previous.chunks_exact(BPP))
234            {
235                for i in 0..BPP {
236                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
237                }
238
239                a_bpp = [c[0] as i16];
240                c_bpp = [p[0] as i16];
241            }
242        }
243        BytesPerPixel::Two => {
244            const BPP: usize = 2;
245            let mut a_bpp = [0; BPP];
246            let mut c_bpp = [0; BPP];
247
248            for (c, p) in current
249                .chunks_exact_mut(BPP)
250                .zip(previous.chunks_exact(BPP))
251            {
252                for i in 0..BPP {
253                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
254                }
255
256                a_bpp = [c[0] as i16, c[1] as i16];
257                c_bpp = [p[0] as i16, p[1] as i16];
258            }
259        }
260        BytesPerPixel::Three => {
261            #[cfg(feature = "unstable")]
262            {
263                // Results in PR: https://github.com/image-rs/image-png/pull/632
264                // 23% better on an Epyc 7B13, 10% on a Zen 3 part.
265                // ~30% when targeting x86-64-v2.
266                super::simd::paeth_unfilter_3bpp(current, previous);
267                return;
268            }
269            const BPP: usize = 3;
270            let mut a_bpp = [0; BPP];
271            let mut c_bpp = [0; BPP];
272
273            for (c, p) in current
274                .chunks_exact_mut(BPP)
275                .zip(previous.chunks_exact(BPP))
276            {
277                for i in 0..BPP {
278                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
279                }
280
281                a_bpp = [c[0] as i16, c[1] as i16, c[2] as i16];
282                c_bpp = [p[0] as i16, p[1] as i16, p[2] as i16];
283            }
284        }
285        BytesPerPixel::Four => {
286            #[cfg(feature = "unstable")]
287            {
288                // Results in PR: https://github.com/image-rs/image-png/pull/633
289                // May be slightly faster on AMD EPYC 7B13.
290                super::simd::paeth_unfilter_4bpp(current, previous);
291                return;
292            }
293            const BPP: usize = 4;
294            let mut a_bpp = [0; BPP];
295            let mut c_bpp = [0; BPP];
296
297            for (c, p) in current
298                .chunks_exact_mut(BPP)
299                .zip(previous.chunks_exact(BPP))
300            {
301                for i in 0..BPP {
302                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
303                }
304
305                a_bpp = [c[0] as i16, c[1] as i16, c[2] as i16, c[3] as i16];
306                c_bpp = [p[0] as i16, p[1] as i16, p[2] as i16, p[3] as i16];
307            }
308        }
309        BytesPerPixel::Six => {
310            const BPP: usize = 6;
311            let mut a_bpp = [0; BPP];
312            let mut c_bpp = [0; BPP];
313
314            for (c, p) in current
315                .chunks_exact_mut(BPP)
316                .zip(previous.chunks_exact(BPP))
317            {
318                for i in 0..BPP {
319                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
320                }
321
322                a_bpp = [
323                    c[0] as i16,
324                    c[1] as i16,
325                    c[2] as i16,
326                    c[3] as i16,
327                    c[4] as i16,
328                    c[5] as i16,
329                ];
330                c_bpp = [
331                    p[0] as i16,
332                    p[1] as i16,
333                    p[2] as i16,
334                    p[3] as i16,
335                    p[4] as i16,
336                    p[5] as i16,
337                ];
338            }
339        }
340        BytesPerPixel::Eight => {
341            const BPP: usize = 8;
342            let mut a_bpp = [0; BPP];
343            let mut c_bpp = [0; BPP];
344
345            for (c, p) in current
346                .chunks_exact_mut(BPP)
347                .zip(previous.chunks_exact(BPP))
348            {
349                for i in 0..BPP {
350                    c[i] = c[i].wrapping_add(filter_paeth_stbi(a_bpp[i], p[i] as i16, c_bpp[i]));
351                }
352
353                a_bpp = [
354                    c[0] as i16,
355                    c[1] as i16,
356                    c[2] as i16,
357                    c[3] as i16,
358                    c[4] as i16,
359                    c[5] as i16,
360                    c[6] as i16,
361                    c[7] as i16,
362                ];
363                c_bpp = [
364                    p[0] as i16,
365                    p[1] as i16,
366                    p[2] as i16,
367                    p[3] as i16,
368                    p[4] as i16,
369                    p[5] as i16,
370                    p[6] as i16,
371                    p[7] as i16,
372                ];
373            }
374        }
375    }
376}
377
#[cfg(test)]
mod tests {
    use super::*;

    /// Exhaustively verifies that all three Paeth implementations agree on
    /// every possible (a, b, c) byte triple.
    #[test]
    #[ignore] // takes ~20s without optimizations
    fn paeth_impls_are_equivalent() {
        for a in 0..=255u8 {
            for b in 0..=255u8 {
                for c in 0..=255u8 {
                    let expected = filter_paeth(a, b, c);

                    assert_eq!(filter_paeth_fpnge(a, b, c), expected);
                    assert_eq!(
                        filter_paeth_stbi(a as i16, b as i16, c as i16),
                        expected
                    );
                }
            }
        }
    }
}
398}