1#![cfg(feature = "sse_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::interpolator::BarycentricWeight;
32use crate::conversions::lut_transforms::Lut3x3Factory;
33use crate::conversions::sse::assert_barycentric_lut_size_precondition;
34use crate::conversions::sse::interpolator::*;
35use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
36use crate::conversions::sse::t_lut3_to_3_q0_15::TransformLut3x3SseQ0_15;
37use crate::transform::PointeeSizeExpressible;
38use crate::{
39 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
40 TransformExecutor, TransformOptions,
41};
42use num_traits::AsPrimitive;
43#[cfg(target_arch = "x86")]
44use std::arch::x86::*;
45#[cfg(target_arch = "x86_64")]
46use std::arch::x86_64::*;
47use std::marker::PhantomData;
48use std::sync::Arc;
49
50struct TransformLut3x3Sse<
51 T,
52 U,
53 const SRC_LAYOUT: u8,
54 const DST_LAYOUT: u8,
55 const GRID_SIZE: usize,
56 const BIT_DEPTH: usize,
57 const BINS: usize,
58 const BARYCENTRIC_BINS: usize,
59> {
60 lut: Vec<SseAlignedF32>,
61 _phantom: PhantomData<T>,
62 _phantom2: PhantomData<U>,
63 interpolation_method: InterpolationMethod,
64 weights: Box<[BarycentricWeight<f32>; BINS]>,
65 color_space: DataColorSpace,
66 is_linear: bool,
67}
68
69impl<
70 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
71 U: AsPrimitive<usize>,
72 const SRC_LAYOUT: u8,
73 const DST_LAYOUT: u8,
74 const GRID_SIZE: usize,
75 const BIT_DEPTH: usize,
76 const BINS: usize,
77 const BARYCENTRIC_BINS: usize,
78> TransformLut3x3Sse<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
79where
80 f32: AsPrimitive<T>,
81 u32: AsPrimitive<T>,
82 (): LutBarycentricReduction<T, U>,
83{
84 #[allow(unused_unsafe)]
85 #[target_feature(enable = "sse4.1")]
86 unsafe fn transform_chunk(
87 &self,
88 src: &[T],
89 dst: &mut [T],
90 interpolator: Box<dyn SseMdInterpolation + Send + Sync>,
91 ) {
92 let src_cn = Layout::from(SRC_LAYOUT);
93 let src_channels = src_cn.channels();
94
95 let dst_cn = Layout::from(DST_LAYOUT);
96 let dst_channels = dst_cn.channels();
97
98 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
99 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
100
101 for (src, dst) in src
102 .chunks_exact(src_channels)
103 .zip(dst.chunks_exact_mut(dst_channels))
104 {
105 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[src_cn.r_i()],
107 );
108 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
109 src[src_cn.g_i()],
110 );
111 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
112 src[src_cn.b_i()],
113 );
114
115 let a = if src_channels == 4 {
116 src[src_cn.a_i()]
117 } else {
118 max_value
119 };
120
121 let v = interpolator.inter3_sse(
122 &self.lut,
123 x.as_(),
124 y.as_(),
125 z.as_(),
126 self.weights.as_slice(),
127 );
128 if T::FINITE {
129 unsafe {
130 let mut r = _mm_mul_ps(v.v, value_scale);
131 r = _mm_max_ps(r, _mm_setzero_ps());
132 r = _mm_min_ps(r, value_scale);
133 let jvz = _mm_cvtps_epi32(r);
134
135 let x = _mm_extract_epi32::<0>(jvz);
136 let y = _mm_extract_epi32::<1>(jvz);
137 let z = _mm_extract_epi32::<2>(jvz);
138
139 dst[dst_cn.r_i()] = (x as u32).as_();
140 dst[dst_cn.g_i()] = (y as u32).as_();
141 dst[dst_cn.b_i()] = (z as u32).as_();
142 }
143 } else {
144 unsafe {
145 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
146 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
147 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
148 }
149 }
150 if dst_channels == 4 {
151 dst[dst_cn.a_i()] = a;
152 }
153 }
154 }
155}
156
157impl<
158 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
159 U: AsPrimitive<usize>,
160 const SRC_LAYOUT: u8,
161 const DST_LAYOUT: u8,
162 const GRID_SIZE: usize,
163 const BIT_DEPTH: usize,
164 const BINS: usize,
165 const BARYCENTRIC_BINS: usize,
166> TransformExecutor<T>
167 for TransformLut3x3Sse<
168 T,
169 U,
170 SRC_LAYOUT,
171 DST_LAYOUT,
172 GRID_SIZE,
173 BIT_DEPTH,
174 BINS,
175 BARYCENTRIC_BINS,
176 >
177where
178 f32: AsPrimitive<T>,
179 u32: AsPrimitive<T>,
180 (): LutBarycentricReduction<T, U>,
181{
182 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
183 let src_cn = Layout::from(SRC_LAYOUT);
184 let src_channels = src_cn.channels();
185
186 let dst_cn = Layout::from(DST_LAYOUT);
187 let dst_channels = dst_cn.channels();
188 if src.len() % src_channels != 0 {
189 return Err(CmsError::LaneMultipleOfChannels);
190 }
191 if dst.len() % dst_channels != 0 {
192 return Err(CmsError::LaneMultipleOfChannels);
193 }
194 let src_chunks = src.len() / src_channels;
195 let dst_chunks = dst.len() / dst_channels;
196 if src_chunks != dst_chunks {
197 return Err(CmsError::LaneSizeMismatch);
198 }
199
200 unsafe {
201 if self.color_space == DataColorSpace::Lab
202 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
203 || self.color_space == DataColorSpace::Xyz
204 {
205 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
206 } else {
207 match self.interpolation_method {
208 #[cfg(feature = "options")]
209 InterpolationMethod::Tetrahedral => {
210 self.transform_chunk(src, dst, Box::new(TetrahedralSse::<GRID_SIZE> {}));
211 }
212 #[cfg(feature = "options")]
213 InterpolationMethod::Pyramid => {
214 self.transform_chunk(src, dst, Box::new(PyramidalSse::<GRID_SIZE> {}));
215 }
216 #[cfg(feature = "options")]
217 InterpolationMethod::Prism => {
218 self.transform_chunk(src, dst, Box::new(PrismaticSse::<GRID_SIZE> {}));
219 }
220 InterpolationMethod::Linear => {
221 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
222 }
223 }
224 }
225 }
226 Ok(())
227 }
228}
229
230pub(crate) struct SseLut3x3Factory {}
231
232impl Lut3x3Factory for SseLut3x3Factory {
233 fn make_transform_3x3<
234 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
235 const SRC_LAYOUT: u8,
236 const DST_LAYOUT: u8,
237 const GRID_SIZE: usize,
238 const BIT_DEPTH: usize,
239 >(
240 lut: Vec<f32>,
241 options: TransformOptions,
242 color_space: DataColorSpace,
243 is_linear: bool,
244 ) -> Arc<dyn TransformExecutor<T> + Sync + Send>
245 where
246 f32: AsPrimitive<T>,
247 u32: AsPrimitive<T>,
248 (): LutBarycentricReduction<T, u8>,
249 (): LutBarycentricReduction<T, u16>,
250 {
251 if options.prefer_fixed_point && BIT_DEPTH < 16 {
252 let q: f32 = if T::FINITE {
253 ((1i32 << BIT_DEPTH as i32) - 1) as f32
254 } else {
255 ((1i32 << 14i32) - 1) as f32
256 };
257 let lut = lut
258 .chunks_exact(3)
259 .map(|x| {
260 SseAlignedI16x4([
261 (x[0] * q).round() as i16,
262 (x[1] * q).round() as i16,
263 (x[2] * q).round() as i16,
264 0,
265 ])
266 })
267 .collect::<Vec<_>>();
268 return match options.barycentric_weight_scale {
269 BarycentricWeightScale::Low => {
270 let bins = BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>();
271 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
272 Arc::new(TransformLut3x3SseQ0_15::<
273 T,
274 u8,
275 SRC_LAYOUT,
276 DST_LAYOUT,
277 GRID_SIZE,
278 BIT_DEPTH,
279 256,
280 256,
281 > {
282 lut,
283 _phantom: PhantomData,
284 _phantom2: PhantomData,
285 interpolation_method: options.interpolation_method,
286 weights: bins,
287 color_space,
288 is_linear,
289 })
290 }
291 #[cfg(feature = "options")]
292 BarycentricWeightScale::High => {
293 let bins = BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>();
294 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
295 Arc::new(TransformLut3x3SseQ0_15::<
296 T,
297 u16,
298 SRC_LAYOUT,
299 DST_LAYOUT,
300 GRID_SIZE,
301 BIT_DEPTH,
302 65536,
303 65536,
304 > {
305 lut,
306 _phantom: PhantomData,
307 _phantom2: PhantomData,
308 interpolation_method: options.interpolation_method,
309 weights: bins,
310 color_space,
311 is_linear,
312 })
313 }
314 };
315 }
316 let lut = lut
317 .chunks_exact(3)
318 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
319 .collect::<Vec<_>>();
320 match options.barycentric_weight_scale {
321 BarycentricWeightScale::Low => {
322 let bins = BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>();
323 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
324 Arc::new(TransformLut3x3Sse::<
325 T,
326 u8,
327 SRC_LAYOUT,
328 DST_LAYOUT,
329 GRID_SIZE,
330 BIT_DEPTH,
331 256,
332 256,
333 > {
334 lut,
335 _phantom: PhantomData,
336 _phantom2: PhantomData,
337 interpolation_method: options.interpolation_method,
338 weights: bins,
339 color_space,
340 is_linear,
341 })
342 }
343 #[cfg(feature = "options")]
344 BarycentricWeightScale::High => {
345 let bins = BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>();
346 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
347 Arc::new(TransformLut3x3Sse::<
348 T,
349 u16,
350 SRC_LAYOUT,
351 DST_LAYOUT,
352 GRID_SIZE,
353 BIT_DEPTH,
354 65536,
355 65536,
356 > {
357 lut,
358 _phantom: PhantomData,
359 _phantom2: PhantomData,
360 interpolation_method: options.interpolation_method,
361 weights: bins,
362 color_space,
363 is_linear,
364 })
365 }
366 }
367 }
368}