1#![cfg(feature = "avx_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::avx::assert_barycentric_lut_size_precondition;
32use crate::conversions::avx::interpolator::*;
33use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
34use crate::conversions::avx::lut4_to_3_q0_15::TransformLut4To3AvxQ0_15;
35use crate::conversions::interpolator::BarycentricWeight;
36use crate::conversions::lut_transforms::Lut4x3Factory;
37use crate::transform::PointeeSizeExpressible;
38use crate::{
39 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
40 TransformExecutor, TransformOptions,
41};
42use num_traits::AsPrimitive;
43use std::arch::x86_64::*;
44use std::marker::PhantomData;
45use std::sync::Arc;
46
47struct TransformLut4To3Avx<
48 T,
49 U,
50 const LAYOUT: u8,
51 const GRID_SIZE: usize,
52 const BIT_DEPTH: usize,
53 const BINS: usize,
54 const BARYCENTRIC_BINS: usize,
55> {
56 lut: Vec<SseAlignedF32>,
57 _phantom: PhantomData<T>,
58 _phantom1: PhantomData<U>,
59 interpolation_method: InterpolationMethod,
60 weights: Box<[BarycentricWeight<f32>; BINS]>,
61 color_space: DataColorSpace,
62 is_linear: bool,
63}
64
65impl<
66 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
67 U: AsPrimitive<usize>,
68 const LAYOUT: u8,
69 const GRID_SIZE: usize,
70 const BIT_DEPTH: usize,
71 const BINS: usize,
72 const BARYCENTRIC_BINS: usize,
73> TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
74where
75 f32: AsPrimitive<T>,
76 u32: AsPrimitive<T>,
77 (): LutBarycentricReduction<T, U>,
78{
79 #[allow(unused_unsafe)]
80 #[target_feature(enable = "avx2", enable = "fma")]
81 unsafe fn transform_chunk(
82 &self,
83 src: &[T],
84 dst: &mut [T],
85 interpolator: Box<dyn AvxMdInterpolationDouble + Send + Sync>,
86 ) {
87 let cn = Layout::from(LAYOUT);
88 let channels = cn.channels();
89 let grid_size = GRID_SIZE as i32;
90 let grid_size3 = grid_size * grid_size * grid_size;
91
92 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
93 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
94
95 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
96 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
97 src[0],
98 );
99 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100 src[1],
101 );
102 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[2],
104 );
105 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[3],
107 );
108
109 let k_weights = self.weights[k.as_()];
110
111 let w: i32 = k_weights.x;
112 let w_n: i32 = k_weights.x_n;
113 let t: f32 = k_weights.w;
114
115 let table1 = &self.lut[(w * grid_size3) as usize..];
116 let table2 = &self.lut[(w_n * grid_size3) as usize..];
117
118 let v = interpolator.inter3_sse(
119 table1,
120 table2,
121 c.as_(),
122 m.as_(),
123 y.as_(),
124 self.weights.as_slice(),
125 );
126 let (a0, b0) = (v.0.v, v.1.v);
127
128 if T::FINITE {
129 unsafe {
130 let t0 = _mm_set1_ps(t);
131 let hp = _mm_fnmadd_ps(a0, t0, a0);
132 let mut v = _mm_fmadd_ps(b0, t0, hp);
133 v = _mm_max_ps(v, _mm_setzero_ps());
134 v = _mm_mul_ps(v, value_scale);
135 v = _mm_min_ps(v, value_scale);
136 let jvz = _mm_cvtps_epi32(v);
137
138 let x = _mm_extract_epi32::<0>(jvz);
139 let y = _mm_extract_epi32::<1>(jvz);
140 let z = _mm_extract_epi32::<2>(jvz);
141
142 dst[cn.r_i()] = (x as u32).as_();
143 dst[cn.g_i()] = (y as u32).as_();
144 dst[cn.b_i()] = (z as u32).as_();
145 }
146 } else {
147 unsafe {
148 let t0 = _mm_set1_ps(t);
149 let hp = _mm_fnmadd_ps(a0, t0, a0);
150 let v = _mm_fmadd_ps(b0, t0, hp);
151 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
152 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
153 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
154 }
155 }
156 if channels == 4 {
157 dst[cn.a_i()] = max_value;
158 }
159 }
160 }
161}
162
163impl<
164 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
165 U: AsPrimitive<usize>,
166 const LAYOUT: u8,
167 const GRID_SIZE: usize,
168 const BIT_DEPTH: usize,
169 const BINS: usize,
170 const BARYCENTRIC_BINS: usize,
171> TransformExecutor<T>
172 for TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
173where
174 f32: AsPrimitive<T>,
175 u32: AsPrimitive<T>,
176 (): LutBarycentricReduction<T, U>,
177{
178 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
179 let cn = Layout::from(LAYOUT);
180 let channels = cn.channels();
181 if src.len() % 4 != 0 {
182 return Err(CmsError::LaneMultipleOfChannels);
183 }
184 if dst.len() % channels != 0 {
185 return Err(CmsError::LaneMultipleOfChannels);
186 }
187 let src_chunks = src.len() / 4;
188 let dst_chunks = dst.len() / channels;
189 if src_chunks != dst_chunks {
190 return Err(CmsError::LaneSizeMismatch);
191 }
192
193 unsafe {
194 if self.color_space == DataColorSpace::Lab
195 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
196 || self.color_space == DataColorSpace::Xyz
197 {
198 self.transform_chunk(src, dst, Box::new(TrilinearAvxFmaDouble::<GRID_SIZE> {}));
199 } else {
200 match self.interpolation_method {
201 #[cfg(feature = "options")]
202 InterpolationMethod::Tetrahedral => {
203 self.transform_chunk(
204 src,
205 dst,
206 Box::new(TetrahedralAvxFmaDouble::<GRID_SIZE> {}),
207 );
208 }
209 #[cfg(feature = "options")]
210 InterpolationMethod::Pyramid => {
211 self.transform_chunk(
212 src,
213 dst,
214 Box::new(PyramidAvxFmaDouble::<GRID_SIZE> {}),
215 );
216 }
217 #[cfg(feature = "options")]
218 InterpolationMethod::Prism => {
219 self.transform_chunk(
220 src,
221 dst,
222 Box::new(PrismaticAvxFmaDouble::<GRID_SIZE> {}),
223 );
224 }
225 InterpolationMethod::Linear => {
226 self.transform_chunk(
227 src,
228 dst,
229 Box::new(TrilinearAvxFmaDouble::<GRID_SIZE> {}),
230 );
231 }
232 }
233 }
234 }
235
236 Ok(())
237 }
238}
239
240pub(crate) struct AvxLut4x3Factory {}
241
242impl Lut4x3Factory for AvxLut4x3Factory {
243 fn make_transform_4x3<
244 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
245 const LAYOUT: u8,
246 const GRID_SIZE: usize,
247 const BIT_DEPTH: usize,
248 >(
249 lut: Vec<f32>,
250 options: TransformOptions,
251 color_space: DataColorSpace,
252 is_linear: bool,
253 ) -> Arc<dyn TransformExecutor<T> + Send + Sync>
254 where
255 f32: AsPrimitive<T>,
256 u32: AsPrimitive<T>,
257 (): LutBarycentricReduction<T, u8>,
258 (): LutBarycentricReduction<T, u16>,
259 {
260 if options.prefer_fixed_point && BIT_DEPTH < 16 {
261 let q: f32 = if T::FINITE {
262 ((1i32 << BIT_DEPTH as i32) - 1) as f32
263 } else {
264 ((1i32 << 14i32) - 1) as f32
265 };
266 let lut = lut
267 .chunks_exact(3)
268 .map(|x| {
269 AvxAlignedI16([
270 (x[0] * q).round() as i16,
271 (x[1] * q).round() as i16,
272 (x[2] * q).round() as i16,
273 0,
274 ])
275 })
276 .collect::<Vec<_>>();
277 return match options.barycentric_weight_scale {
278 BarycentricWeightScale::Low => {
279 let bins = BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>();
280 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
281 Arc::new(TransformLut4To3AvxQ0_15::<
282 T,
283 u8,
284 LAYOUT,
285 GRID_SIZE,
286 BIT_DEPTH,
287 256,
288 256,
289 > {
290 lut,
291 interpolation_method: options.interpolation_method,
292 weights: bins,
293 _phantom: PhantomData,
294 _phantom1: PhantomData,
295 color_space,
296 is_linear,
297 })
298 }
299 #[cfg(feature = "options")]
300 BarycentricWeightScale::High => {
301 let bins = BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>();
302 assert_barycentric_lut_size_precondition::<i16, GRID_SIZE>(bins.as_slice());
303 Arc::new(TransformLut4To3AvxQ0_15::<
304 T,
305 u16,
306 LAYOUT,
307 GRID_SIZE,
308 BIT_DEPTH,
309 65536,
310 65536,
311 > {
312 lut,
313 interpolation_method: options.interpolation_method,
314 weights: bins,
315 _phantom: PhantomData,
316 _phantom1: PhantomData,
317 color_space,
318 is_linear,
319 })
320 }
321 };
322 }
323 assert!(
324 std::arch::is_x86_feature_detected!("fma"),
325 "Internal configuration error, this feature might not be called without `fma` feature"
326 );
327 let lut = lut
328 .chunks_exact(3)
329 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
330 .collect::<Vec<_>>();
331 match options.barycentric_weight_scale {
332 BarycentricWeightScale::Low => {
333 let bins = BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>();
334 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
335 Arc::new(
336 TransformLut4To3Avx::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
337 lut,
338 interpolation_method: options.interpolation_method,
339 weights: bins,
340 _phantom: PhantomData,
341 _phantom1: PhantomData,
342 color_space,
343 is_linear,
344 },
345 )
346 }
347 #[cfg(feature = "options")]
348 BarycentricWeightScale::High => {
349 let bins = BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>();
350 assert_barycentric_lut_size_precondition::<f32, GRID_SIZE>(bins.as_slice());
351 Arc::new(
352 TransformLut4To3Avx::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
353 lut,
354 interpolation_method: options.interpolation_method,
355 weights: bins,
356 _phantom: PhantomData,
357 _phantom1: PhantomData,
358 color_space,
359 is_linear,
360 },
361 )
362 }
363 }
364 }
365}