1#![cfg(feature = "sse_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::interpolator::BarycentricWeight;
32use crate::conversions::sse::interpolator_q0_15::*;
33use crate::transform::PointeeSizeExpressible;
34use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
35use num_traits::AsPrimitive;
36#[cfg(target_arch = "x86")]
37use std::arch::x86::*;
38#[cfg(target_arch = "x86_64")]
39use std::arch::x86_64::*;
40use std::marker::PhantomData;
41
42pub(crate) struct TransformLut3x3SseQ0_15<
43 T,
44 U,
45 const SRC_LAYOUT: u8,
46 const DST_LAYOUT: u8,
47 const GRID_SIZE: usize,
48 const BIT_DEPTH: usize,
49 const BINS: usize,
50 const BARYCENTRIC_BINS: usize,
51> {
52 pub(crate) lut: Vec<SseAlignedI16x4>,
53 pub(crate) _phantom: PhantomData<T>,
54 pub(crate) _phantom2: PhantomData<U>,
55 pub(crate) interpolation_method: InterpolationMethod,
56 pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
57 pub(crate) color_space: DataColorSpace,
58 pub(crate) is_linear: bool,
59}
60
61impl<
62 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
63 U: AsPrimitive<usize>,
64 const SRC_LAYOUT: u8,
65 const DST_LAYOUT: u8,
66 const GRID_SIZE: usize,
67 const BIT_DEPTH: usize,
68 const BINS: usize,
69 const BARYCENTRIC_BINS: usize,
70>
71 TransformLut3x3SseQ0_15<
72 T,
73 U,
74 SRC_LAYOUT,
75 DST_LAYOUT,
76 GRID_SIZE,
77 BIT_DEPTH,
78 BINS,
79 BARYCENTRIC_BINS,
80 >
81where
82 f32: AsPrimitive<T>,
83 u32: AsPrimitive<T>,
84 (): LutBarycentricReduction<T, U>,
85{
86 #[allow(unused_unsafe)]
87 #[target_feature(enable = "sse4.1")]
88 #[inline(never)]
89 unsafe fn transform_chunk(
90 &self,
91 src: &[T],
92 dst: &mut [T],
93 interpolator: Box<dyn SseMdInterpolationQ0_15 + Send + Sync>,
94 ) {
95 unsafe {
96 let src_cn = Layout::from(SRC_LAYOUT);
97 let src_channels = src_cn.channels();
98
99 let dst_cn = Layout::from(DST_LAYOUT);
100 let dst_channels = dst_cn.channels();
101
102 let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
103 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
104 let v_max_scale = if T::FINITE {
105 _mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
106 } else {
107 _mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
108 };
109
110 for (src, dst) in src
111 .chunks_exact(src_channels)
112 .zip(dst.chunks_exact_mut(dst_channels))
113 {
114 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
115 src[src_cn.r_i()],
116 );
117 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
118 src[src_cn.g_i()],
119 );
120 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
121 src[src_cn.b_i()],
122 );
123
124 let a = if src_channels == 4 {
125 src[src_cn.a_i()]
126 } else {
127 max_value
128 };
129
130 let v = interpolator.inter3_sse(
131 &self.lut,
132 x.as_(),
133 y.as_(),
134 z.as_(),
135 self.weights.as_slice(),
136 );
137 if T::FINITE {
138 let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
139 o = _mm_min_epi16(o, v_max_scale);
140 let x = _mm_extract_epi16::<0>(o);
141 let y = _mm_extract_epi16::<1>(o);
142 let z = _mm_extract_epi16::<2>(o);
143
144 dst[dst_cn.r_i()] = (x as u32).as_();
145 dst[dst_cn.g_i()] = (y as u32).as_();
146 dst[dst_cn.b_i()] = (z as u32).as_();
147 } else {
148 let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
149 r = _mm_mul_ps(r, f_value_scale);
150 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
151 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
152 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
153 }
154 if dst_channels == 4 {
155 dst[dst_cn.a_i()] = a;
156 }
157 }
158 }
159 }
160}
161
162impl<
163 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
164 U: AsPrimitive<usize>,
165 const SRC_LAYOUT: u8,
166 const DST_LAYOUT: u8,
167 const GRID_SIZE: usize,
168 const BIT_DEPTH: usize,
169 const BINS: usize,
170 const BARYCENTRIC_BINS: usize,
171> TransformExecutor<T>
172 for TransformLut3x3SseQ0_15<
173 T,
174 U,
175 SRC_LAYOUT,
176 DST_LAYOUT,
177 GRID_SIZE,
178 BIT_DEPTH,
179 BINS,
180 BARYCENTRIC_BINS,
181 >
182where
183 f32: AsPrimitive<T>,
184 u32: AsPrimitive<T>,
185 (): LutBarycentricReduction<T, U>,
186{
187 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
188 let src_cn = Layout::from(SRC_LAYOUT);
189 let src_channels = src_cn.channels();
190
191 let dst_cn = Layout::from(DST_LAYOUT);
192 let dst_channels = dst_cn.channels();
193 if src.len() % src_channels != 0 {
194 return Err(CmsError::LaneMultipleOfChannels);
195 }
196 if dst.len() % dst_channels != 0 {
197 return Err(CmsError::LaneMultipleOfChannels);
198 }
199 let src_chunks = src.len() / src_channels;
200 let dst_chunks = dst.len() / dst_channels;
201 if src_chunks != dst_chunks {
202 return Err(CmsError::LaneSizeMismatch);
203 }
204
205 unsafe {
206 if self.color_space == DataColorSpace::Lab
207 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
208 || self.color_space == DataColorSpace::Xyz
209 {
210 self.transform_chunk(src, dst, Box::new(TrilinearSseQ0_15::<GRID_SIZE> {}));
211 } else {
212 match self.interpolation_method {
213 #[cfg(feature = "options")]
214 InterpolationMethod::Tetrahedral => {
215 self.transform_chunk(
216 src,
217 dst,
218 Box::new(TetrahedralSseQ0_15::<GRID_SIZE> {}),
219 );
220 }
221 #[cfg(feature = "options")]
222 InterpolationMethod::Pyramid => {
223 self.transform_chunk(src, dst, Box::new(PyramidalSseQ0_15::<GRID_SIZE> {}));
224 }
225 #[cfg(feature = "options")]
226 InterpolationMethod::Prism => {
227 self.transform_chunk(src, dst, Box::new(PrismaticSseQ0_15::<GRID_SIZE> {}));
228 }
229 InterpolationMethod::Linear => {
230 self.transform_chunk(src, dst, Box::new(TrilinearSseQ0_15::<GRID_SIZE> {}));
231 }
232 }
233 }
234 }
235 Ok(())
236 }
237}