1#![cfg(feature = "sse_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::interpolator::BarycentricWeight;
32use crate::conversions::lut_transforms::Lut4x3Factory;
33use crate::conversions::sse::assert_barycentric_lut_size_precondition;
34use crate::conversions::sse::interpolator::*;
35use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
36use crate::conversions::sse::lut4_to_3_q0_15::TransformLut4To3SseQ0_15;
37use crate::transform::PointeeSizeExpressible;
38use crate::{
39 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
40 TransformExecutor, TransformOptions,
41};
42use num_traits::AsPrimitive;
43#[cfg(target_arch = "x86")]
44use std::arch::x86::*;
45#[cfg(target_arch = "x86_64")]
46use std::arch::x86_64::*;
47use std::marker::PhantomData;
48use std::sync::Arc;
49
50struct TransformLut4To3Sse<
51 T,
52 U,
53 const LAYOUT: u8,
54 const GRID_SIZE: usize,
55 const BIT_DEPTH: usize,
56 const BINS: usize,
57 const BARYCENTRIC_BINS: usize,
58> {
59 lut: Vec<SseAlignedF32>,
60 _phantom: PhantomData<T>,
61 _phantom1: PhantomData<U>,
62 interpolation_method: InterpolationMethod,
63 weights: Box<[BarycentricWeight<f32>; BINS]>,
64 color_space: DataColorSpace,
65 is_linear: bool,
66}
67
68impl<
69 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
70 U: AsPrimitive<usize>,
71 const LAYOUT: u8,
72 const GRID_SIZE: usize,
73 const BIT_DEPTH: usize,
74 const BINS: usize,
75 const BARYCENTRIC_BINS: usize,
76> TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
77where
78 f32: AsPrimitive<T>,
79 u32: AsPrimitive<T>,
80 (): LutBarycentricReduction<T, U>,
81{
82 #[allow(unused_unsafe)]
83 #[target_feature(enable = "sse4.1")]
84 unsafe fn transform_chunk(
85 &self,
86 src: &[T],
87 dst: &mut [T],
88 interpolator: Box<dyn SseMdInterpolation + Send + Sync>,
89 ) {
90 let cn = Layout::from(LAYOUT);
91 let channels = cn.channels();
92 let grid_size = GRID_SIZE as i32;
93 let grid_size3 = grid_size * grid_size * grid_size;
94
95 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
96 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
97
98 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
99 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100 src[0],
101 );
102 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[1],
104 );
105 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[2],
107 );
108 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
109 src[3],
110 );
111
112 let k_weights = self.weights[k.as_()];
113
114 let w: i32 = k_weights.x;
115 let w_n: i32 = k_weights.x_n;
116 let t: f32 = k_weights.w;
117
118 let table1 = &self.lut[(w * grid_size3) as usize..];
119 let table2 = &self.lut[(w_n * grid_size3) as usize..];
120
121 let a0 = interpolator
122 .inter3_sse(table1, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
123 .v;
124 let b0 = interpolator
125 .inter3_sse(table2, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
126 .v;
127
128 if T::FINITE {
129 unsafe {
130 let t0 = _mm_set1_ps(t);
131 let ones = _mm_set1_ps(1f32);
132 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
133 let mut v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
134 v = _mm_max_ps(v, _mm_setzero_ps());
135 v = _mm_mul_ps(v, value_scale);
136 v = _mm_min_ps(v, value_scale);
137 let jvz = _mm_cvtps_epi32(v);
138
139 let x = _mm_extract_epi32::<0>(jvz);
140 let y = _mm_extract_epi32::<1>(jvz);
141 let z = _mm_extract_epi32::<2>(jvz);
142
143 dst[cn.r_i()] = (x as u32).as_();
144 dst[cn.g_i()] = (y as u32).as_();
145 dst[cn.b_i()] = (z as u32).as_();
146 }
147 } else {
148 unsafe {
149 let t0 = _mm_set1_ps(t);
150 let ones = _mm_set1_ps(1f32);
151 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
152 let v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
153
154 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
155 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
156 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
157 }
158 }
159 if channels == 4 {
160 dst[cn.a_i()] = max_value;
161 }
162 }
163 }
164}
165
166impl<
167 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
168 U: AsPrimitive<usize>,
169 const LAYOUT: u8,
170 const GRID_SIZE: usize,
171 const BIT_DEPTH: usize,
172 const BINS: usize,
173 const BARYCENTRIC_BINS: usize,
174> TransformExecutor<T>
175 for TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
176where
177 f32: AsPrimitive<T>,
178 u32: AsPrimitive<T>,
179 (): LutBarycentricReduction<T, U>,
180{
181 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
182 let cn = Layout::from(LAYOUT);
183 let channels = cn.channels();
184 if src.len() % 4 != 0 {
185 return Err(CmsError::LaneMultipleOfChannels);
186 }
187 if dst.len() % channels != 0 {
188 return Err(CmsError::LaneMultipleOfChannels);
189 }
190 let src_chunks = src.len() / 4;
191 let dst_chunks = dst.len() / channels;
192 if src_chunks != dst_chunks {
193 return Err(CmsError::LaneSizeMismatch);
194 }
195
196 unsafe {
197 if self.color_space == DataColorSpace::Lab
198 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
199 || self.color_space == DataColorSpace::Xyz
200 {
201 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
202 } else {
203 match self.interpolation_method {
204 #[cfg(feature = "options")]
205 InterpolationMethod::Tetrahedral => {
206 self.transform_chunk(src, dst, Box::new(TetrahedralSse::<GRID_SIZE> {}));
207 }
208 #[cfg(feature = "options")]
209 InterpolationMethod::Pyramid => {
210 self.transform_chunk(src, dst, Box::new(PyramidalSse::<GRID_SIZE> {}));
211 }
212 #[cfg(feature = "options")]
213 InterpolationMethod::Prism => {
214 self.transform_chunk(src, dst, Box::new(PrismaticSse::<GRID_SIZE> {}));
215 }
216 InterpolationMethod::Linear => {
217 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
218 }
219 }
220 }
221 }
222
223 Ok(())
224 }
225}
226
/// Stateless factory type that implements `Lut4x3Factory` with SSE-based
/// 4-to-3 LUT executors.
pub(crate) struct SseLut4x3Factory {}
228
229impl Lut4x3Factory for SseLut4x3Factory {
230 fn make_transform_4x3<
231 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
232 const LAYOUT: u8,
233 const GRID_SIZE: usize,
234 const BIT_DEPTH: usize,
235 >(
236 lut: Vec<f32>,
237 options: TransformOptions,
238 color_space: DataColorSpace,
239 is_linear: bool,
240 ) -> Arc<dyn TransformExecutor<T> + Sync + Send>
241 where
242 f32: AsPrimitive<T>,
243 u32: AsPrimitive<T>,
244 (): LutBarycentricReduction<T, u8>,
245 (): LutBarycentricReduction<T, u16>,
246 {
247 if options.prefer_fixed_point && BIT_DEPTH < 16 {
248 let q: f32 = if T::FINITE {
249 ((1i32 << BIT_DEPTH as i32) - 1) as f32
250 } else {
251 ((1i32 << 14i32) - 1) as f32
252 };
253 let lut = lut
254 .chunks_exact(3)
255 .map(|x| {
256 SseAlignedI16x4([
257 (x[0] * q).round() as i16,
258 (x[1] * q).round() as i16,
259 (x[2] * q).round() as i16,
260 0,
261 ])
262 })
263 .collect::<Vec<_>>();
264 return match options.barycentric_weight_scale {
265 BarycentricWeightScale::Low => {
266 let bins = BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>();
267 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
268 Arc::new(TransformLut4To3SseQ0_15::<
269 T,
270 u8,
271 LAYOUT,
272 GRID_SIZE,
273 BIT_DEPTH,
274 256,
275 256,
276 > {
277 lut,
278 interpolation_method: options.interpolation_method,
279 weights: bins,
280 _phantom: PhantomData,
281 _phantom1: PhantomData,
282 color_space,
283 is_linear,
284 })
285 }
286 #[cfg(feature = "options")]
287 BarycentricWeightScale::High => {
288 let bins = BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>();
289 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
290 Arc::new(TransformLut4To3SseQ0_15::<
291 T,
292 u16,
293 LAYOUT,
294 GRID_SIZE,
295 BIT_DEPTH,
296 65536,
297 65536,
298 > {
299 lut,
300 interpolation_method: options.interpolation_method,
301 weights: bins,
302 _phantom: PhantomData,
303 _phantom1: PhantomData,
304 color_space,
305 is_linear,
306 })
307 }
308 };
309 }
310 let lut = lut
311 .chunks_exact(3)
312 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
313 .collect::<Vec<_>>();
314 match options.barycentric_weight_scale {
315 BarycentricWeightScale::Low => {
316 let bins = BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>();
317 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
318 Arc::new(
319 TransformLut4To3Sse::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
320 lut,
321 _phantom: PhantomData,
322 _phantom1: PhantomData,
323 interpolation_method: options.interpolation_method,
324 weights: bins,
325 color_space,
326 is_linear,
327 },
328 )
329 }
330 #[cfg(feature = "options")]
331 BarycentricWeightScale::High => {
332 let bins = BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>();
333 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
334 Arc::new(
335 TransformLut4To3Sse::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
336 lut,
337 _phantom: PhantomData,
338 _phantom1: PhantomData,
339 interpolation_method: options.interpolation_method,
340 weights: bins,
341 color_space,
342 is_linear,
343 },
344 )
345 }
346 }
347 }
348}