1#![cfg(feature = "avx_luts")]
30use crate::conversions::LutBarycentricReduction;
31use crate::conversions::avx::interpolator_q0_15::*;
32use crate::conversions::interpolator::BarycentricWeight;
33use crate::transform::PointeeSizeExpressible;
34use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
35use num_traits::AsPrimitive;
36use std::arch::x86_64::*;
37use std::marker::PhantomData;
38
39pub(crate) struct TransformLut3x3AvxQ0_15<
40 T,
41 U,
42 const SRC_LAYOUT: u8,
43 const DST_LAYOUT: u8,
44 const GRID_SIZE: usize,
45 const BIT_DEPTH: usize,
46 const BINS: usize,
47 const BARYCENTRIC_BINS: usize,
48> {
49 pub(crate) lut: Vec<AvxAlignedI16>,
50 pub(crate) _phantom: PhantomData<T>,
51 pub(crate) _phantom2: PhantomData<U>,
52 pub(crate) interpolation_method: InterpolationMethod,
53 pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
54 pub(crate) color_space: DataColorSpace,
55 pub(crate) is_linear: bool,
56}
57
58impl<
59 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
60 U: AsPrimitive<usize>,
61 const SRC_LAYOUT: u8,
62 const DST_LAYOUT: u8,
63 const GRID_SIZE: usize,
64 const BIT_DEPTH: usize,
65 const BINS: usize,
66 const BARYCENTRIC_BINS: usize,
67>
68 TransformLut3x3AvxQ0_15<
69 T,
70 U,
71 SRC_LAYOUT,
72 DST_LAYOUT,
73 GRID_SIZE,
74 BIT_DEPTH,
75 BINS,
76 BARYCENTRIC_BINS,
77 >
78where
79 f32: AsPrimitive<T>,
80 u32: AsPrimitive<T>,
81 (): LutBarycentricReduction<T, U>,
82{
83 #[allow(unused_unsafe)]
84 #[target_feature(enable = "avx2")]
85 unsafe fn transform_chunk(
86 &self,
87 src: &[T],
88 dst: &mut [T],
89 interpolator: Box<dyn AvxMdInterpolationQ0_15 + Send + Sync>,
90 ) {
91 unsafe {
92 let src_cn = Layout::from(SRC_LAYOUT);
93 let src_channels = src_cn.channels();
94
95 let dst_cn = Layout::from(DST_LAYOUT);
96 let dst_channels = dst_cn.channels();
97
98 let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
99 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
100 let v_max_scale = if T::FINITE {
101 _mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
102 } else {
103 _mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
104 };
105
106 for (src, dst) in src
107 .chunks_exact(src_channels)
108 .zip(dst.chunks_exact_mut(dst_channels))
109 {
110 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
111 src[src_cn.r_i()],
112 );
113 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
114 src[src_cn.g_i()],
115 );
116 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
117 src[src_cn.b_i()],
118 );
119
120 let a = if src_channels == 4 {
121 src[src_cn.a_i()]
122 } else {
123 max_value
124 };
125
126 let v = interpolator.inter3_sse(
127 &self.lut,
128 x.as_(),
129 y.as_(),
130 z.as_(),
131 self.weights.as_slice(),
132 );
133 if T::FINITE {
134 let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
135 o = _mm_min_epi16(o, v_max_scale);
136 let x = _mm_extract_epi16::<0>(o);
137 let y = _mm_extract_epi16::<1>(o);
138 let z = _mm_extract_epi16::<2>(o);
139
140 dst[dst_cn.r_i()] = (x as u32).as_();
141 dst[dst_cn.g_i()] = (y as u32).as_();
142 dst[dst_cn.b_i()] = (z as u32).as_();
143 } else {
144 let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
145 r = _mm_mul_ps(r, f_value_scale);
146 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
147 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
148 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
149 }
150 if dst_channels == 4 {
151 dst[dst_cn.a_i()] = a;
152 }
153 }
154 }
155 }
156}
157
158impl<
159 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
160 U: AsPrimitive<usize>,
161 const SRC_LAYOUT: u8,
162 const DST_LAYOUT: u8,
163 const GRID_SIZE: usize,
164 const BIT_DEPTH: usize,
165 const BINS: usize,
166 const BARYCENTRIC_BINS: usize,
167> TransformExecutor<T>
168 for TransformLut3x3AvxQ0_15<
169 T,
170 U,
171 SRC_LAYOUT,
172 DST_LAYOUT,
173 GRID_SIZE,
174 BIT_DEPTH,
175 BINS,
176 BARYCENTRIC_BINS,
177 >
178where
179 f32: AsPrimitive<T>,
180 u32: AsPrimitive<T>,
181 (): LutBarycentricReduction<T, U>,
182{
183 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
184 let src_cn = Layout::from(SRC_LAYOUT);
185 let src_channels = src_cn.channels();
186
187 let dst_cn = Layout::from(DST_LAYOUT);
188 let dst_channels = dst_cn.channels();
189 if src.len() % src_channels != 0 {
190 return Err(CmsError::LaneMultipleOfChannels);
191 }
192 if dst.len() % dst_channels != 0 {
193 return Err(CmsError::LaneMultipleOfChannels);
194 }
195 let src_chunks = src.len() / src_channels;
196 let dst_chunks = dst.len() / dst_channels;
197 if src_chunks != dst_chunks {
198 return Err(CmsError::LaneSizeMismatch);
199 }
200
201 unsafe {
202 if self.color_space == DataColorSpace::Lab
203 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
204 || self.color_space == DataColorSpace::Xyz
205 {
206 self.transform_chunk(src, dst, Box::new(TrilinearAvxQ0_15::<GRID_SIZE> {}));
207 } else {
208 match self.interpolation_method {
209 #[cfg(feature = "options")]
210 InterpolationMethod::Tetrahedral => {
211 self.transform_chunk(
212 src,
213 dst,
214 Box::new(TetrahedralAvxQ0_15::<GRID_SIZE> {}),
215 );
216 }
217 #[cfg(feature = "options")]
218 InterpolationMethod::Pyramid => {
219 self.transform_chunk(src, dst, Box::new(PyramidalAvxQ0_15::<GRID_SIZE> {}));
220 }
221 #[cfg(feature = "options")]
222 InterpolationMethod::Prism => {
223 self.transform_chunk(src, dst, Box::new(PrismaticAvxQ0_15::<GRID_SIZE> {}));
224 }
225 InterpolationMethod::Linear => {
226 self.transform_chunk(src, dst, Box::new(TrilinearAvxQ0_15::<GRID_SIZE> {}));
227 }
228 }
229 }
230 }
231 Ok(())
232 }
233}