Skip to main content

zune_jpeg/idct/
scalar.rs

1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! Platform independent IDCT algorithm
10//!
11//! Not as fast as AVX one.
12
/// Bias added to each even-part term of the horizontal pass before the final `>> 17`.
///
/// * `128 << 17` survives the `>> 17` as `+128` on every output sample.
/// * `1 << 16` is half of `1 << 17`, so the arithmetic shift rounds to nearest
///   instead of truncating.
/// * `512` — NOTE(review): same magnitude as the rounding term used by the
///   vertical pass; its purpose in this sum is not evident from this file.
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + 512;
14
/// Two's-complement wrapping addition: returns `a + b` modulo 2^32 and never
/// panics on overflow, in debug or release builds.
#[inline(always)]
fn wa(a: i32, b: i32) -> i32 {
    let (sum, _overflowed) = a.overflowing_add(b);
    sum
}
19
/// Two's-complement wrapping subtraction: returns `a - b` modulo 2^32 and never
/// panics on overflow, in debug or release builds.
#[inline(always)]
fn ws(a: i32, b: i32) -> i32 {
    let (difference, _overflowed) = a.overflowing_sub(b);
    difference
}
24
/// Two's-complement wrapping multiplication: returns `a * b` modulo 2^32 and
/// never panics on overflow, in debug or release builds.
#[inline(always)]
fn wm(a: i32, b: i32) -> i32 {
    let (product, _overflowed) = a.overflowing_mul(b);
    product
}
29
/// IDCT for a block in which only the first coefficient (`in_vector[0]`) is used.
///
/// A single sample value is derived from `in_vector[0]` and written to eight
/// rows of eight samples in `out_vector`; consecutive rows start `stride`
/// elements apart.
///
/// The value is `(in_vector[0] + 4 + 1024) >> 3`, clamped to `0..=255`:
/// `4` is half of `8` (round-to-nearest for the shift) and `1024 >> 3`
/// contributes `+128` to the result.
///
/// Rows that do not fit completely inside `out_vector` are skipped rather than
/// panicking. This matches `idct_int`, which tolerates a short output buffer
/// and delegates to this function for DC-only blocks.
#[inline]
pub fn idct_int_1x1(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    let biased = in_vector[0].wrapping_add(4).wrapping_add(1024);
    let coeff = (biased >> 3).clamp(0, 255) as i16;

    let mut pos = 0_usize;
    for _ in 0..8 {
        // Saturating math keeps an absurd `stride` from overflowing; such a
        // range simply fails the bounds check below.
        if let Some(row) = out_vector.get_mut(pos..pos.saturating_add(8)) {
            row.fill(coeff);
        }
        pos = pos.saturating_add(stride);
    }
}
41
42#[allow(unused_assignments)]
43#[allow(
44    clippy::too_many_lines,
45    clippy::op_ref,
46    clippy::cast_possible_truncation
47)]
48pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
49    let mut pos = 0;
50    let mut i = 0;
51
52    if &in_vector[1..] == &[0_i32; 63] {
53        return idct_int_1x1(in_vector, out_vector, stride);
54    }
55
56    // vertical pass
57    for ptr in 0..8 {
58        let p2 = in_vector[ptr + 16];
59        let p3 = in_vector[ptr + 48];
60
61        let p1 = wm(wa(p2, p3), 2217);
62
63        let t2 = wa(p1, wm(p3, -7567));
64        let t3 = wa(p1, wm(p2, 3135));
65
66        let p2 = in_vector[ptr];
67        let p3 = in_vector[32 + ptr];
68
69        let t0 = fsh(wa(p2, p3));
70        let t1 = fsh(ws(p2, p3));
71
72        let x0 = wa(wa(t0, t3), 512);
73        let x3 = wa(ws(t0, t3), 512);
74        let x1 = wa(wa(t1, t2), 512);
75        let x2 = wa(ws(t1, t2), 512);
76
77        let mut t0 = in_vector[ptr + 56];
78        let mut t1 = in_vector[ptr + 40];
79        let mut t2 = in_vector[ptr + 24];
80        let mut t3 = in_vector[ptr + 8];
81
82        let p3 = wa(t0, t2);
83        let p4 = wa(t1, t3);
84        let p1 = wa(t0, t3);
85        let p2 = wa(t1, t2);
86        let p5 = wm(wa(p3, p4), 4816);
87
88        t0 = wm(t0, 1223);
89        t1 = wm(t1, 8410);
90        t2 = wm(t2, 12586);
91        t3 = wm(t3, 6149);
92
93        let p1 = wa(p5, wm(p1, -3685));
94        let p2 = wa(p5, wm(p2, -10497));
95        let p3 = wm(p3, -8034);
96        let p4 = wm(p4, -1597);
97
98        t3 = wa(t3, wa(p1, p4));
99        t2 = wa(t2, wa(p2, p3));
100        t1 = wa(t1, wa(p2, p4));
101        t0 = wa(t0, wa(p1, p3));
102
103        in_vector[ptr] = ws(wa(x0, t3), 0) >> 10;
104        in_vector[ptr + 8] = ws(wa(x1, t2), 0) >> 10;
105        in_vector[ptr + 16] = ws(wa(x2, t1), 0) >> 10;
106        in_vector[ptr + 24] = ws(wa(x3, t0), 0) >> 10;
107        in_vector[ptr + 32] = ws(ws(x3, t0), 0) >> 10;
108        in_vector[ptr + 40] = ws(ws(x2, t1), 0) >> 10;
109        in_vector[ptr + 48] = ws(ws(x1, t2), 0) >> 10;
110        in_vector[ptr + 56] = ws(ws(x0, t3), 0) >> 10;
111    }
112
113    // horizontal pass
114    while i < 64 {
115        let p2 = in_vector[i + 2];
116        let p3 = in_vector[i + 6];
117
118        let p1 = wm(wa(p2, p3), 2217);
119        let t2 = wa(p1, wm(p3, -7567));
120        let t3 = wa(p1, wm(p2, 3135));
121
122        let p2 = in_vector[i];
123        let p3 = in_vector[i + 4];
124
125        let t0 = fsh(wa(p2, p3));
126        let t1 = fsh(ws(p2, p3));
127
128        let x0 = wa(wa(t0, t3), SCALE_BITS);
129        let x3 = wa(ws(t0, t3), SCALE_BITS);
130        let x1 = wa(wa(t1, t2), SCALE_BITS);
131        let x2 = wa(ws(t1, t2), SCALE_BITS);
132
133        let mut t0 = in_vector[i + 7];
134        let mut t1 = in_vector[i + 5];
135        let mut t2 = in_vector[i + 3];
136        let mut t3 = in_vector[i + 1];
137
138        let p3 = wa(t0, t2);
139        let p4 = wa(t1, t3);
140        let p1 = wa(t0, t3);
141        let p2 = wa(t1, t2);
142        let p5 = wm(wa(p3, p4), f2f(1.175875602));
143
144        t0 = wm(t0, 1223);
145        t1 = wm(t1, 8410);
146        t2 = wm(t2, 12586);
147        t3 = wm(t3, 6149);
148
149        let p1 = wa(p5, wm(p1, -3685));
150        let p2 = wa(p5, wm(p2, -10497));
151        let p3 = wm(p3, -8034);
152        let p4 = wm(p4, -1597);
153
154        t3 = wa(t3, wa(p1, p4));
155        t2 = wa(t2, wa(p2, p3));
156        t1 = wa(t1, wa(p2, p4));
157        t0 = wa(t0, wa(p1, p3));
158
159        // to prevent some bad images from crashing
160        let mut tmp = [0; 8];
161
162        let out: &mut [i16; 8] = out_vector
163            .get_mut(pos..pos + 8)
164            .unwrap_or(&mut tmp)
165            .try_into()
166            .unwrap();
167
168        out[0] = clamp(wa(x0, t3) >> 17);
169        out[1] = clamp(wa(x1, t2) >> 17);
170        out[2] = clamp(wa(x2, t1) >> 17);
171        out[3] = clamp(wa(x3, t0) >> 17);
172        out[4] = clamp(ws(x3, t0) >> 17);
173        out[5] = clamp(ws(x2, t1) >> 17);
174        out[6] = clamp(ws(x1, t2) >> 17);
175        out[7] = clamp(ws(x0, t3) >> 17);
176
177        i += 8;
178        pos += stride;
179    }
180}
181
/// Convert a float to fixed point with 12 fractional bits.
///
/// The input is multiplied by 4096, `0.5` is added, and the result is
/// converted with `as`, which truncates toward zero.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    let scaled = x * 4096.0;
    let nudged = scaled + 0.5;
    nudged as i32
}
188
/// Scale an integer by 4096 (shift left by 12 bits); bits shifted past the
/// top of the `i32` are discarded.
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;
    x.wrapping_shl(FRACTION_BITS)
}
194
/// Saturate a value to the range `0..=255` and narrow it to `i16`.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    // After saturation the value always fits in an `i16`.
    let bounded = a.max(0).min(255);
    bounded as i16
}
201
202/// IDCT assuming only the upper 4x4 is filled.
203pub fn idct4x4(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
204    let mut pos = 0;
205
206    // vertical pass
207    for ptr in 0..4 {
208        let i0 = wa(fsh(in_vector[ptr]), 512);
209        let i2 = in_vector[ptr + 16];
210
211        let p1 = wm(i2, 2217);
212        let p3 = wm(i2, 5352);
213
214        let x0 = wa(i0, p3);
215        let x1 = wa(i0, p1);
216        let x2 = ws(i0, p1);
217        let x3 = ws(i0, p3);
218
219        // odd part
220        let i4 = in_vector[ptr + 24];
221        let i3 = in_vector[ptr + 8];
222
223        let p5 = wm(wa(i4, i3), 4816);
224
225        let p1 = wa(p5, wm(i3, -3685));
226        let p2 = wa(p5, wm(i4, -10497));
227
228        let t3 = wa(p5, wm(i3, 867));
229        let t2 = wa(p5, wm(i4, -5945));
230
231        let t1 = wa(p2, wm(i3, -1597));
232        let t0 = wa(p1, wm(i4, -8034));
233
234        in_vector[ptr] = wa(x0, t3) >> 10;
235        in_vector[ptr + 8] = wa(x1, t2) >> 10;
236        in_vector[ptr + 16] = wa(x2, t1) >> 10;
237        in_vector[ptr + 24] = wa(x3, t0) >> 10;
238        in_vector[ptr + 32] = ws(x3, t0) >> 10;
239        in_vector[ptr + 40] = ws(x2, t1) >> 10;
240        in_vector[ptr + 48] = ws(x1, t2) >> 10;
241        in_vector[ptr + 56] = ws(x0, t3) >> 10;
242    }
243
244    // horizontal pass
245    for i in (0..8).map(|i| 8 * i) {
246        let i2 = in_vector[i + 2];
247        let i0 = in_vector[i];
248
249        let t0 = wa(fsh(i0), SCALE_BITS);
250        let t2 = wm(i2, 2217);
251        let t3 = wm(i2, 5352);
252
253        let x0 = wa(t0, t3);
254        let x3 = ws(t0, t3);
255        let x1 = wa(t0, t2);
256        let x2 = ws(t0, t2);
257
258        // odd part
259        let i3 = in_vector[i + 3];
260        let i1 = in_vector[i + 1];
261
262        let p5 = wm(wa(i3, i1), f2f(1.175875602));
263
264        let p1 = wa(p5, wm(i1, -3685));
265        let p2 = wa(p5, wm(i3, -10497));
266
267        let t3 = wa(p5, wm(i1, 867));
268        let t2 = wa(p5, wm(i3, -5945));
269
270        let t1 = wa(p2, wm(i1, -1597));
271        let t0 = wa(p1, wm(i3, -8034));
272
273        // to prevent some bad images from crashing
274        let mut tmp = [0; 8];
275
276        let out: &mut [i16; 8] = out_vector
277            .get_mut(pos..pos + 8)
278            .unwrap_or(&mut tmp)
279            .try_into()
280            .unwrap();
281
282        out.copy_from_slice(&[
283            clamp(wa(x0, t3) >> 17),
284            clamp(wa(x1, t2) >> 17),
285            clamp(wa(x2, t1) >> 17),
286            clamp(wa(x3, t0) >> 17),
287            clamp(ws(x3, t0) >> 17),
288            clamp(ws(x2, t1) >> 17),
289            clamp(ws(x1, t2) >> 17),
290            clamp(ws(x0, t3) >> 17)
291        ]);
292
293        pos += stride;
294    }
295
296    in_vector[32..36].fill(0);
297    in_vector[40..44].fill(0);
298    in_vector[48..52].fill(0);
299    in_vector[56..60].fill(0);
300}