// zune_jpeg/color_convert/avx.rs

/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

//! AVX color conversion routines
//!
//! Okay these codes are cool
//!
//! Herein lies super optimized codes to do color conversions.
//!
//!
//! 1. The YCbCr to RGB conversions use integer approximations and not the floating point equivalent.
//! That means we may be +- 2 of pixels generated by libjpeg-turbo jpeg decoding
//! (also libjpeg uses routines like `Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points... fun fact: the first fun fact wasn't even fun.)
//!
//! Secondly, we have cool clamping code, especially for rgba, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions to work like 2 128-bit vectors
//! (the compiler optimizes it out to something cool).
//!
//! There isn't a lot here (not as fun as bitstream) but I hope you find what you're looking for.
//!
//! Oh, and ~~subscribe to my youtube channel~~

#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
    clippy::wildcard_imports,
    clippy::cast_possible_truncation,
    clippy::too_many_arguments,
    clippy::inline_always,
    clippy::doc_markdown,
    dead_code
)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::color_convert::scalar::{CB_CF, CR_CF, C_G_CB_COEF_2, C_G_CR_COEF_1, YUV_RND, Y_CF};

49pub union YmmRegister {
50    // both are 32 when using std::mem::size_of
51    mm256: __m256i,
52    // for avx color conversion
53    array: [i16; 16]
54}
55
56const R_AVX_COEF: i32 = i32::from_ne_bytes([CR_CF.to_ne_bytes()[0], CR_CF.to_ne_bytes()[1], 0, 0]);
57const B_AVX_COEF: i32 = i32::from_ne_bytes([0, 0, CB_CF.to_ne_bytes()[0], CB_CF.to_ne_bytes()[1]]);
58const G_COEF_AVX_COEF: i32 = i32::from_ne_bytes([
59    C_G_CR_COEF_1.to_ne_bytes()[0],
60    C_G_CR_COEF_1.to_ne_bytes()[1],
61    C_G_CB_COEF_2.to_ne_bytes()[0],
62    C_G_CB_COEF_2.to_ne_bytes()[1]
63]);
64
//--------------------------------------------------------------------------------------------------
// AVX conversion routines
//--------------------------------------------------------------------------------------------------

69///
70/// Convert YCBCR to RGB using AVX instructions
71///
72///  # Note
73///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
74/// AVX2 OTHERWISE THIS IS UB**
75///
76/// *Peace*
77///
78/// This library itself will ensure that it's never called in CPU's not
79/// supporting AVX2
80///
81/// # Arguments
82/// - `y`,`cb`,`cr`: A reference of 8 i32's
83/// - `out`: The output  array where we store our converted items
84/// - `offset`: The position from 0 where we write these RGB values
85#[inline(always)]
86pub fn ycbcr_to_rgb_avx2(
87    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
88) {
89    // call this in another function to tell RUST to vectorize this
90    // storing
91    unsafe {
92        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
93    }
94}
95
96#[inline]
97#[target_feature(enable = "avx2")]
98unsafe fn ycbcr_to_rgb_avx2_1(
99    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
100) {
101    let (mut r, mut g, mut b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
102
103    r = _mm256_packus_epi16(r, _mm256_setzero_si256());
104    g = _mm256_packus_epi16(g, _mm256_setzero_si256());
105    b = _mm256_packus_epi16(b, _mm256_setzero_si256());
106
107    r = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(r);
108    g = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(g);
109    b = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(b);
110
111    let sh_r = _mm256_setr_epi8(
112        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
113        9, 4, 15, 10, 5
114    );
115    let sh_g = _mm256_setr_epi8(
116        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
117        14, 9, 4, 15, 10
118    );
119    let sh_b = _mm256_setr_epi8(
120        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
121        3, 14, 9, 4, 15
122    );
123
124    let r0 = _mm256_shuffle_epi8(r, sh_r);
125    let g0 = _mm256_shuffle_epi8(g, sh_g);
126    let b0 = _mm256_shuffle_epi8(b, sh_b);
127
128    let m0 = _mm256_setr_epi8(
129        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
130        0, 0, -1, 0, 0
131    );
132    let m1 = _mm256_setr_epi8(
133        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
134        -1, 0, 0, -1, 0
135    );
136
137    let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
138    let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
139    let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
140
141    let rgb0 = _mm256_permute2x128_si256::<32>(p0, p1);
142    let rgb1 = _mm256_permute2x128_si256::<48>(p2, p0);
143
144    _mm256_storeu_si256(out.as_mut_ptr().cast(), rgb0);
145    _mm_storeu_si128(out[32..].as_mut_ptr().cast(), _mm256_castsi256_si128(rgb1));
146
147    *offset += 48;
148}
149
150// Enabled avx2 automatically enables avx.
151#[inline]
152#[target_feature(enable = "avx2")]
153/// A baseline implementation of YCbCr to RGB conversion which does not carry
154/// out clamping
155///
156/// This is used by the `ycbcr_to_rgba_avx` and `ycbcr_to_rgbx` conversion
157/// routines
158unsafe fn ycbcr_to_rgb_baseline_no_clamp(
159    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
160) -> (__m256i, __m256i, __m256i) {
161    // Load values into a register
162    //
163    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
164    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
165    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
166
167    // Here we want to use _mm256_madd_epi16 to perform 2 multiplications
168    // and one addition per instruction.
169
170    // At first, we have to pack i16 U and V that stores u8 into one u8 [U,V]
171    // then zero extend, and keep in mind that lanes is already been permuted.
172
173    let y_coeff = _mm256_set1_epi32(i32::from(Y_CF));
174    let cr_coeff = _mm256_set1_epi32(R_AVX_COEF);
175    let cb_coeff = _mm256_set1_epi32(B_AVX_COEF);
176    let cg_coeff = _mm256_set1_epi32(G_COEF_AVX_COEF);
177    let v_rnd = _mm256_set1_epi32(i32::from(YUV_RND));
178    let uv_bias = _mm256_set1_epi16(128);
179
180    // UV in memory because x86/x86_64 is always little endian
181    let v_0 = _mm256_slli_epi16::<8>(cb_c);
182    let u_v_8 = _mm256_or_si256(v_0, cr_c);
183
184    let mut u_v_lo = _mm256_unpacklo_epi8(u_v_8, _mm256_setzero_si256());
185    let mut u_v_hi = _mm256_unpackhi_epi8(u_v_8, _mm256_setzero_si256());
186
187    let mut y_lo = _mm256_unpacklo_epi16(y_c, _mm256_setzero_si256());
188    let mut y_hi = _mm256_unpackhi_epi16(y_c, _mm256_setzero_si256());
189
190    u_v_lo = _mm256_sub_epi16(u_v_lo, uv_bias);
191    u_v_hi = _mm256_sub_epi16(u_v_hi, uv_bias);
192
193    y_lo = _mm256_madd_epi16(y_lo, y_coeff);
194    y_hi = _mm256_madd_epi16(y_hi, y_coeff);
195
196    let mut r_lo = _mm256_madd_epi16(u_v_lo, cr_coeff);
197    let mut r_hi = _mm256_madd_epi16(u_v_hi, cr_coeff);
198
199    let mut g_lo = _mm256_madd_epi16(u_v_lo, cg_coeff);
200    let mut g_hi = _mm256_madd_epi16(u_v_hi, cg_coeff);
201
202    // This ordering is preferred to reduce register file pressure.
203
204    y_lo = _mm256_add_epi32(y_lo, v_rnd);
205    y_hi = _mm256_add_epi32(y_hi, v_rnd);
206
207    let mut b_lo = _mm256_madd_epi16(u_v_lo, cb_coeff);
208    let mut b_hi = _mm256_madd_epi16(u_v_hi, cb_coeff);
209
210    r_lo = _mm256_add_epi32(r_lo, y_lo);
211    r_hi = _mm256_add_epi32(r_hi, y_hi);
212
213    g_lo = _mm256_add_epi32(g_lo, y_lo);
214    g_hi = _mm256_add_epi32(g_hi, y_hi);
215
216    b_lo = _mm256_add_epi32(b_lo, y_lo);
217    b_hi = _mm256_add_epi32(b_hi, y_hi);
218
219    r_lo = _mm256_srai_epi32::<14>(r_lo);
220    r_hi = _mm256_srai_epi32::<14>(r_hi);
221
222    g_lo = _mm256_srai_epi32::<14>(g_lo);
223    g_hi = _mm256_srai_epi32::<14>(g_hi);
224
225    b_lo = _mm256_srai_epi32::<14>(b_lo);
226    b_hi = _mm256_srai_epi32::<14>(b_hi);
227
228    let r = _mm256_packus_epi32(r_lo, r_hi);
229    let g = _mm256_packus_epi32(g_lo, g_hi);
230    let b = _mm256_packus_epi32(b_lo, b_hi);
231
232    return (r, g, b);
233}
234
235#[inline(always)]
236pub fn ycbcr_to_rgba_avx2(
237    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
238) {
239    unsafe {
240        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
241    }
242}
243
244#[inline]
245#[target_feature(enable = "avx2")]
246#[rustfmt::skip]
247unsafe fn ycbcr_to_rgba_unsafe(
248    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
249    out: &mut [u8],
250    offset: &mut usize,
251)
252{
253    // check if we have enough space to write.
254    let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
255
256    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
257
258    // set alpha channel to 255 for opaque
259
260    // And no these comments were not from me pressing the keyboard
261
262    // Pack the integers into u8's using unsigned saturation.
263    let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
264    let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd
265    // transpose_u16 and interleave channels
266    let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
267    let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
268    // final transpose_u16
269    let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
270    let h = _mm256_unpackhi_epi8(e, f);
271    
272    // undo packus shuffling...
273    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
274    
275    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
276    
277    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
278    
279    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
280    
281    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
282    
283    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
284    
285    // Store
286    // Use streaming instructions to prevent polluting the cache?
287    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
288    
289    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
290
291    *offset += 64;
292}
293