Skip to main content

moxcms/conversions/sse/
lut4_to_3.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29#![cfg(feature = "sse_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::interpolator::BarycentricWeight;
32use crate::conversions::lut_transforms::Lut4x3Factory;
33use crate::conversions::sse::assert_barycentric_lut_size_precondition;
34use crate::conversions::sse::interpolator::*;
35use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
36use crate::conversions::sse::lut4_to_3_q0_15::TransformLut4To3SseQ0_15;
37use crate::transform::PointeeSizeExpressible;
38use crate::{
39    BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
40    TransformExecutor, TransformOptions,
41};
42use num_traits::AsPrimitive;
43#[cfg(target_arch = "x86")]
44use std::arch::x86::*;
45#[cfg(target_arch = "x86_64")]
46use std::arch::x86_64::*;
47use std::marker::PhantomData;
48use std::sync::Arc;
49
50struct TransformLut4To3Sse<
51    T,
52    U,
53    const LAYOUT: u8,
54    const GRID_SIZE: usize,
55    const BIT_DEPTH: usize,
56    const BINS: usize,
57    const BARYCENTRIC_BINS: usize,
58> {
59    lut: Vec<SseAlignedF32>,
60    _phantom: PhantomData<T>,
61    _phantom1: PhantomData<U>,
62    interpolation_method: InterpolationMethod,
63    weights: Box<[BarycentricWeight<f32>; BINS]>,
64    color_space: DataColorSpace,
65    is_linear: bool,
66}
67
68impl<
69    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
70    U: AsPrimitive<usize>,
71    const LAYOUT: u8,
72    const GRID_SIZE: usize,
73    const BIT_DEPTH: usize,
74    const BINS: usize,
75    const BARYCENTRIC_BINS: usize,
76> TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
77where
78    f32: AsPrimitive<T>,
79    u32: AsPrimitive<T>,
80    (): LutBarycentricReduction<T, U>,
81{
82    #[allow(unused_unsafe)]
83    #[target_feature(enable = "sse4.1")]
84    unsafe fn transform_chunk(
85        &self,
86        src: &[T],
87        dst: &mut [T],
88        interpolator: Box<dyn SseMdInterpolation + Send + Sync>,
89    ) {
90        let cn = Layout::from(LAYOUT);
91        let channels = cn.channels();
92        let grid_size = GRID_SIZE as i32;
93        let grid_size3 = grid_size * grid_size * grid_size;
94
95        let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
96        let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
97
98        for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
99            let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100                src[0],
101            );
102            let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103                src[1],
104            );
105            let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106                src[2],
107            );
108            let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
109                src[3],
110            );
111
112            let k_weights = self.weights[k.as_()];
113
114            let w: i32 = k_weights.x;
115            let w_n: i32 = k_weights.x_n;
116            let t: f32 = k_weights.w;
117
118            let table1 = &self.lut[(w * grid_size3) as usize..];
119            let table2 = &self.lut[(w_n * grid_size3) as usize..];
120
121            let a0 = interpolator
122                .inter3_sse(table1, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
123                .v;
124            let b0 = interpolator
125                .inter3_sse(table2, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
126                .v;
127
128            if T::FINITE {
129                unsafe {
130                    let t0 = _mm_set1_ps(t);
131                    let ones = _mm_set1_ps(1f32);
132                    let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
133                    let mut v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
134                    v = _mm_max_ps(v, _mm_setzero_ps());
135                    v = _mm_mul_ps(v, value_scale);
136                    v = _mm_min_ps(v, value_scale);
137                    let jvz = _mm_cvtps_epi32(v);
138
139                    let x = _mm_extract_epi32::<0>(jvz);
140                    let y = _mm_extract_epi32::<1>(jvz);
141                    let z = _mm_extract_epi32::<2>(jvz);
142
143                    dst[cn.r_i()] = (x as u32).as_();
144                    dst[cn.g_i()] = (y as u32).as_();
145                    dst[cn.b_i()] = (z as u32).as_();
146                }
147            } else {
148                unsafe {
149                    let t0 = _mm_set1_ps(t);
150                    let ones = _mm_set1_ps(1f32);
151                    let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
152                    let v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
153
154                    dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
155                    dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
156                    dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
157                }
158            }
159            if channels == 4 {
160                dst[cn.a_i()] = max_value;
161            }
162        }
163    }
164}
165
166impl<
167    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
168    U: AsPrimitive<usize>,
169    const LAYOUT: u8,
170    const GRID_SIZE: usize,
171    const BIT_DEPTH: usize,
172    const BINS: usize,
173    const BARYCENTRIC_BINS: usize,
174> TransformExecutor<T>
175    for TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
176where
177    f32: AsPrimitive<T>,
178    u32: AsPrimitive<T>,
179    (): LutBarycentricReduction<T, U>,
180{
181    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
182        let cn = Layout::from(LAYOUT);
183        let channels = cn.channels();
184        if src.len() % 4 != 0 {
185            return Err(CmsError::LaneMultipleOfChannels);
186        }
187        if dst.len() % channels != 0 {
188            return Err(CmsError::LaneMultipleOfChannels);
189        }
190        let src_chunks = src.len() / 4;
191        let dst_chunks = dst.len() / channels;
192        if src_chunks != dst_chunks {
193            return Err(CmsError::LaneSizeMismatch);
194        }
195
196        unsafe {
197            if self.color_space == DataColorSpace::Lab
198                || (self.is_linear && self.color_space == DataColorSpace::Rgb)
199                || self.color_space == DataColorSpace::Xyz
200            {
201                self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
202            } else {
203                match self.interpolation_method {
204                    #[cfg(feature = "options")]
205                    InterpolationMethod::Tetrahedral => {
206                        self.transform_chunk(src, dst, Box::new(TetrahedralSse::<GRID_SIZE> {}));
207                    }
208                    #[cfg(feature = "options")]
209                    InterpolationMethod::Pyramid => {
210                        self.transform_chunk(src, dst, Box::new(PyramidalSse::<GRID_SIZE> {}));
211                    }
212                    #[cfg(feature = "options")]
213                    InterpolationMethod::Prism => {
214                        self.transform_chunk(src, dst, Box::new(PrismaticSse::<GRID_SIZE> {}));
215                    }
216                    InterpolationMethod::Linear => {
217                        self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
218                    }
219                }
220            }
221        }
222
223        Ok(())
224    }
225}
226
227pub(crate) struct SseLut4x3Factory {}
228
229impl Lut4x3Factory for SseLut4x3Factory {
230    fn make_transform_4x3<
231        T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
232        const LAYOUT: u8,
233        const GRID_SIZE: usize,
234        const BIT_DEPTH: usize,
235    >(
236        lut: Vec<f32>,
237        options: TransformOptions,
238        color_space: DataColorSpace,
239        is_linear: bool,
240    ) -> Arc<dyn TransformExecutor<T> + Sync + Send>
241    where
242        f32: AsPrimitive<T>,
243        u32: AsPrimitive<T>,
244        (): LutBarycentricReduction<T, u8>,
245        (): LutBarycentricReduction<T, u16>,
246    {
247        if options.prefer_fixed_point && BIT_DEPTH < 16 {
248            let q: f32 = if T::FINITE {
249                ((1i32 << BIT_DEPTH as i32) - 1) as f32
250            } else {
251                ((1i32 << 14i32) - 1) as f32
252            };
253            let lut = lut
254                .chunks_exact(3)
255                .map(|x| {
256                    SseAlignedI16x4([
257                        (x[0] * q).round() as i16,
258                        (x[1] * q).round() as i16,
259                        (x[2] * q).round() as i16,
260                        0,
261                    ])
262                })
263                .collect::<Vec<_>>();
264            return match options.barycentric_weight_scale {
265                BarycentricWeightScale::Low => {
266                    let bins = BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>();
267                    assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
268                    Arc::new(TransformLut4To3SseQ0_15::<
269                        T,
270                        u8,
271                        LAYOUT,
272                        GRID_SIZE,
273                        BIT_DEPTH,
274                        256,
275                        256,
276                    > {
277                        lut,
278                        interpolation_method: options.interpolation_method,
279                        weights: bins,
280                        _phantom: PhantomData,
281                        _phantom1: PhantomData,
282                        color_space,
283                        is_linear,
284                    })
285                }
286                #[cfg(feature = "options")]
287                BarycentricWeightScale::High => {
288                    let bins = BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>();
289                    assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
290                    Arc::new(TransformLut4To3SseQ0_15::<
291                        T,
292                        u16,
293                        LAYOUT,
294                        GRID_SIZE,
295                        BIT_DEPTH,
296                        65536,
297                        65536,
298                    > {
299                        lut,
300                        interpolation_method: options.interpolation_method,
301                        weights: bins,
302                        _phantom: PhantomData,
303                        _phantom1: PhantomData,
304                        color_space,
305                        is_linear,
306                    })
307                }
308            };
309        }
310        let lut = lut
311            .chunks_exact(3)
312            .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
313            .collect::<Vec<_>>();
314        match options.barycentric_weight_scale {
315            BarycentricWeightScale::Low => {
316                let bins = BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>();
317                assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
318                Arc::new(
319                    TransformLut4To3Sse::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
320                        lut,
321                        _phantom: PhantomData,
322                        _phantom1: PhantomData,
323                        interpolation_method: options.interpolation_method,
324                        weights: bins,
325                        color_space,
326                        is_linear,
327                    },
328                )
329            }
330            #[cfg(feature = "options")]
331            BarycentricWeightScale::High => {
332                let bins = BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>();
333                assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
334                Arc::new(
335                    TransformLut4To3Sse::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
336                        lut,
337                        _phantom: PhantomData,
338                        _phantom1: PhantomData,
339                        interpolation_method: options.interpolation_method,
340                        weights: bins,
341                        color_space,
342                        is_linear,
343                    },
344                )
345            }
346        }
347    }
348}