1#![cfg(feature = "avx_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::avx::assert_barycentric_lut_size_precondition;
32use crate::conversions::avx::interpolator::*;
33use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
34use crate::conversions::avx::t_lut3_to_3_q0_15::TransformLut3x3AvxQ0_15;
35use crate::conversions::interpolator::BarycentricWeight;
36use crate::conversions::lut_transforms::Lut3x3Factory;
37use crate::transform::PointeeSizeExpressible;
38use crate::{
39 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
40 TransformExecutor, TransformOptions,
41};
42use num_traits::AsPrimitive;
43use std::arch::x86_64::*;
44use std::marker::PhantomData;
45use std::sync::Arc;
46
/// Executor state for a 3-channel to 3-channel LUT transform whose per-pixel work
/// is done by `transform_chunk`, a routine compiled with the `avx2` and `fma`
/// target features.
///
/// Generic parameters, as used by the impls in this file:
/// * `T` - sample type of the source and destination buffers.
/// * `U` - type of the reduced channel value produced by
///   `LutBarycentricReduction::reduce` and handed to the interpolator.
/// * `SRC_LAYOUT` / `DST_LAYOUT` - raw layout codes converted with `Layout::from`.
/// * `GRID_SIZE` - forwarded to the interpolator types as their const parameter.
/// * `BIT_DEPTH` - determines the `(1 << BIT_DEPTH) - 1` output scale and the
///   default alpha value.
/// * `BINS` - length of the `weights` table.
/// * `BARYCENTRIC_BINS` - forwarded to `LutBarycentricReduction::reduce`.
struct TransformLut3x3AvxFma<
    T,
    U,
    const SRC_LAYOUT: u8,
    const DST_LAYOUT: u8,
    const GRID_SIZE: usize,
    const BIT_DEPTH: usize,
    const BINS: usize,
    const BARYCENTRIC_BINS: usize,
> {
    // LUT nodes; the factory in this file stores three LUT values per entry and
    // pads the fourth lane with zero.
    lut: Vec<SseAlignedF32>,
    // Zero-sized markers tying the otherwise unused `T` and `U` to the struct.
    _phantom: PhantomData<T>,
    _phantom2: PhantomData<U>,
    // Interpolation requested by the caller; `transform` consults it only when
    // the colour space does not force trilinear interpolation.
    interpolation_method: InterpolationMethod,
    // Barycentric weight table passed to the interpolator for every pixel.
    weights: Box<[BarycentricWeight<f32>; BINS]>,
    // `transform` always uses trilinear interpolation for Lab, Xyz, and for
    // Rgb when `is_linear` is set.
    color_space: DataColorSpace,
    is_linear: bool,
}
65
66impl<
67 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
68 U: AsPrimitive<usize>,
69 const SRC_LAYOUT: u8,
70 const DST_LAYOUT: u8,
71 const GRID_SIZE: usize,
72 const BIT_DEPTH: usize,
73 const BINS: usize,
74 const BARYCENTRIC_BINS: usize,
75> TransformLut3x3AvxFma<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
76where
77 f32: AsPrimitive<T>,
78 u32: AsPrimitive<T>,
79 (): LutBarycentricReduction<T, U>,
80{
81 #[allow(unused_unsafe)]
82 #[target_feature(enable = "avx2", enable = "fma")]
83 unsafe fn transform_chunk(
84 &self,
85 src: &[T],
86 dst: &mut [T],
87 interpolator: Box<dyn AvxMdInterpolation + Send + Sync>,
88 ) {
89 let src_cn = Layout::from(SRC_LAYOUT);
90 let src_channels = src_cn.channels();
91
92 let dst_cn = Layout::from(DST_LAYOUT);
93 let dst_channels = dst_cn.channels();
94
95 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
96 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
97
98 for (src, dst) in src
99 .chunks_exact(src_channels)
100 .zip(dst.chunks_exact_mut(dst_channels))
101 {
102 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[src_cn.r_i()],
104 );
105 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[src_cn.g_i()],
107 );
108 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
109 src[src_cn.b_i()],
110 );
111
112 let a = if src_channels == 4 {
113 src[src_cn.a_i()]
114 } else {
115 max_value
116 };
117
118 let v = interpolator.inter3_sse(
119 &self.lut,
120 x.as_(),
121 y.as_(),
122 z.as_(),
123 self.weights.as_slice(),
124 );
125 if T::FINITE {
126 unsafe {
127 let mut r = _mm_mul_ps(v.v, value_scale);
128 r = _mm_max_ps(r, _mm_setzero_ps());
129 r = _mm_min_ps(r, value_scale);
130 let jvz = _mm_cvtps_epi32(r);
131
132 let x = _mm_extract_epi32::<0>(jvz);
133 let y = _mm_extract_epi32::<1>(jvz);
134 let z = _mm_extract_epi32::<2>(jvz);
135
136 dst[dst_cn.r_i()] = (x as u32).as_();
137 dst[dst_cn.g_i()] = (y as u32).as_();
138 dst[dst_cn.b_i()] = (z as u32).as_();
139 }
140 } else {
141 unsafe {
142 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
143 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
144 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
145 }
146 }
147 if dst_channels == 4 {
148 dst[dst_cn.a_i()] = a;
149 }
150 }
151 }
152}
153
154impl<
155 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
156 U: AsPrimitive<usize>,
157 const SRC_LAYOUT: u8,
158 const DST_LAYOUT: u8,
159 const GRID_SIZE: usize,
160 const BIT_DEPTH: usize,
161 const BINS: usize,
162 const BARYCENTRIC_BINS: usize,
163> TransformExecutor<T>
164 for TransformLut3x3AvxFma<
165 T,
166 U,
167 SRC_LAYOUT,
168 DST_LAYOUT,
169 GRID_SIZE,
170 BIT_DEPTH,
171 BINS,
172 BARYCENTRIC_BINS,
173 >
174where
175 f32: AsPrimitive<T>,
176 u32: AsPrimitive<T>,
177 (): LutBarycentricReduction<T, U>,
178{
179 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
180 let src_cn = Layout::from(SRC_LAYOUT);
181 let src_channels = src_cn.channels();
182
183 let dst_cn = Layout::from(DST_LAYOUT);
184 let dst_channels = dst_cn.channels();
185 if src.len() % src_channels != 0 {
186 return Err(CmsError::LaneMultipleOfChannels);
187 }
188 if dst.len() % dst_channels != 0 {
189 return Err(CmsError::LaneMultipleOfChannels);
190 }
191 let src_chunks = src.len() / src_channels;
192 let dst_chunks = dst.len() / dst_channels;
193 if src_chunks != dst_chunks {
194 return Err(CmsError::LaneSizeMismatch);
195 }
196
197 unsafe {
198 if self.color_space == DataColorSpace::Lab
199 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
200 || self.color_space == DataColorSpace::Xyz
201 {
202 self.transform_chunk(src, dst, Box::new(TrilinearAvxFma::<GRID_SIZE> {}));
203 } else {
204 match self.interpolation_method {
205 #[cfg(feature = "options")]
206 InterpolationMethod::Tetrahedral => {
207 self.transform_chunk(src, dst, Box::new(TetrahedralAvxFma::<GRID_SIZE> {}));
208 }
209 #[cfg(feature = "options")]
210 InterpolationMethod::Pyramid => {
211 self.transform_chunk(src, dst, Box::new(PyramidalAvxFma::<GRID_SIZE> {}));
212 }
213 #[cfg(feature = "options")]
214 InterpolationMethod::Prism => {
215 self.transform_chunk(src, dst, Box::new(PrismaticAvxFma::<GRID_SIZE> {}));
216 }
217 InterpolationMethod::Linear => {
218 self.transform_chunk(src, dst, Box::new(TrilinearAvxFma::<GRID_SIZE> {}));
219 }
220 }
221 }
222 }
223 Ok(())
224 }
225}
226
/// Stateless factory type; its `Lut3x3Factory` implementation builds the AVX
/// 3x3 LUT transform executors (fixed-point Q0.15 or floating-point FMA).
pub(crate) struct AvxLut3x3Factory {}
228
229impl Lut3x3Factory for AvxLut3x3Factory {
230 fn make_transform_3x3<
231 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
232 const SRC_LAYOUT: u8,
233 const DST_LAYOUT: u8,
234 const GRID_SIZE: usize,
235 const BIT_DEPTH: usize,
236 >(
237 lut: Vec<f32>,
238 options: TransformOptions,
239 color_space: DataColorSpace,
240 is_linear: bool,
241 ) -> Arc<dyn TransformExecutor<T> + Send + Sync>
242 where
243 f32: AsPrimitive<T>,
244 u32: AsPrimitive<T>,
245 (): LutBarycentricReduction<T, u8>,
246 (): LutBarycentricReduction<T, u16>,
247 {
248 if options.prefer_fixed_point && BIT_DEPTH < 16 {
249 let q: f32 = if T::FINITE {
250 ((1i32 << BIT_DEPTH as i32) - 1) as f32
251 } else {
252 ((1i32 << 14i32) - 1) as f32
253 };
254 let lut = lut
255 .chunks_exact(3)
256 .map(|x| {
257 AvxAlignedI16([
258 (x[0] * q).round() as i16,
259 (x[1] * q).round() as i16,
260 (x[2] * q).round() as i16,
261 0,
262 ])
263 })
264 .collect::<Vec<_>>();
265 return match options.barycentric_weight_scale {
266 BarycentricWeightScale::Low => {
267 let bins = BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>();
268 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
269 Arc::new(TransformLut3x3AvxQ0_15::<
270 T,
271 u8,
272 SRC_LAYOUT,
273 DST_LAYOUT,
274 GRID_SIZE,
275 BIT_DEPTH,
276 256,
277 256,
278 > {
279 lut,
280 _phantom: PhantomData,
281 _phantom2: PhantomData,
282 interpolation_method: options.interpolation_method,
283 weights: bins,
284 color_space,
285 is_linear,
286 })
287 }
288 #[cfg(feature = "options")]
289 BarycentricWeightScale::High => {
290 let bins = BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>();
291 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
292 Arc::new(TransformLut3x3AvxQ0_15::<
293 T,
294 u16,
295 SRC_LAYOUT,
296 DST_LAYOUT,
297 GRID_SIZE,
298 BIT_DEPTH,
299 65536,
300 65536,
301 > {
302 lut,
303 _phantom: PhantomData,
304 _phantom2: PhantomData,
305 interpolation_method: options.interpolation_method,
306 weights: bins,
307 color_space,
308 is_linear,
309 })
310 }
311 };
312 }
313 assert!(
314 std::arch::is_x86_feature_detected!("fma"),
315 "Internal configuration error, this might not be called without `fma` feature"
316 );
317 let lut = lut
318 .chunks_exact(3)
319 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
320 .collect::<Vec<_>>();
321 match options.barycentric_weight_scale {
322 BarycentricWeightScale::Low => {
323 let bins = BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>();
324 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
325 Arc::new(TransformLut3x3AvxFma::<
326 T,
327 u8,
328 SRC_LAYOUT,
329 DST_LAYOUT,
330 GRID_SIZE,
331 BIT_DEPTH,
332 256,
333 256,
334 > {
335 lut,
336 _phantom: PhantomData,
337 _phantom2: PhantomData,
338 interpolation_method: options.interpolation_method,
339 weights: bins,
340 color_space,
341 is_linear,
342 })
343 }
344 #[cfg(feature = "options")]
345 BarycentricWeightScale::High => {
346 let bins = BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>();
347 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
348 Arc::new(TransformLut3x3AvxFma::<
349 T,
350 u16,
351 SRC_LAYOUT,
352 DST_LAYOUT,
353 GRID_SIZE,
354 BIT_DEPTH,
355 65536,
356 65536,
357 > {
358 lut,
359 _phantom: PhantomData,
360 _phantom2: PhantomData,
361 interpolation_method: options.interpolation_method,
362 weights: bins,
363 color_space,
364 is_linear,
365 })
366 }
367 }
368 }
369}