1#![cfg(feature = "sse_shaper_fixed_point_paths")]
30use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFpOptVec;
31use crate::conversions::sse::SseAlignedU16;
32use crate::transform::PointeeSizeExpressible;
33use crate::{CmsError, Layout, TransformExecutor};
34use num_traits::AsPrimitive;
35#[cfg(target_arch = "x86")]
36use std::arch::x86::*;
37#[cfg(target_arch = "x86_64")]
38use std::arch::x86_64::*;
39
/// Loads one `i32` into the lowest 32-bit lane of an SSE register.
///
/// The value is moved bit-for-bit through a scalar single-precision load
/// (`_mm_load_ss`), which clears the three upper lanes, and the register is
/// then reinterpreted as an integer vector.
///
/// # Safety
///
/// Executes SSE/SSE2 intrinsics, so it must only be called on a CPU that
/// supports them.
#[inline(always)]
#[allow(dead_code)]
pub(crate) unsafe fn _xmm_load_epi32(f: &i32) -> __m128i {
    // `i32` and `f32` share size and alignment, so viewing the referenced
    // storage as an `f32` is only a reinterpretation of the same four bytes.
    let lane_ptr = (f as *const i32).cast::<f32>();
    // SAFETY: `lane_ptr` comes from a live `&i32`, hence it is valid for a
    // 4-byte read; availability of the intrinsics is the caller's contract.
    unsafe { _mm_castps_si128(_mm_load_ss(lane_ptr)) }
}
46
47pub(crate) struct TransformShaperQ2_13OptSse<
48 T: Copy,
49 const SRC_LAYOUT: u8,
50 const DST_LAYOUT: u8,
51 const PRECISION: i32,
52> {
53 pub(crate) profile: TransformMatrixShaperFpOptVec<i32, i16, T>,
54 pub(crate) bit_depth: usize,
55 pub(crate) gamma_lut: usize,
56}
57
58impl<
59 T: Copy + PointeeSizeExpressible + 'static,
60 const SRC_LAYOUT: u8,
61 const DST_LAYOUT: u8,
62 const PRECISION: i32,
63> TransformShaperQ2_13OptSse<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
64where
65 u32: AsPrimitive<T>,
66{
67 #[target_feature(enable = "sse4.1")]
68 unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
69 let src_cn = Layout::from(SRC_LAYOUT);
70 let dst_cn = Layout::from(DST_LAYOUT);
71 let src_channels = src_cn.channels();
72 let dst_channels = dst_cn.channels();
73
74 let mut temporary = SseAlignedU16([0; 8]);
75
76 if src.len() / src_channels != dst.len() / dst_channels {
77 return Err(CmsError::LaneSizeMismatch);
78 }
79 if src.len() % src_channels != 0 {
80 return Err(CmsError::LaneMultipleOfChannels);
81 }
82 if dst.len() % dst_channels != 0 {
83 return Err(CmsError::LaneMultipleOfChannels);
84 }
85
86 let t = self.profile.adaptation_matrix.transpose();
87
88 let max_colors = ((1 << self.bit_depth) - 1).as_();
89
90 unsafe {
91 let m0 = _mm_setr_epi16(
92 t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
93 );
94 let m2 = _mm_setr_epi16(t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0);
95
96 let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
97 let rnd = _mm_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
98
99 let v_max_value = _mm_set1_epi32(self.gamma_lut as i32 - 1);
100
101 if T::FINITE {
103 let cap = (1 << self.bit_depth) - 1;
104 assert!(self.profile.linear.len() >= cap);
105 } else {
106 assert!(self.profile.linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
107 }
108
109 let lut_lin = &self.profile.linear;
110
111 for (src, dst) in src
112 .chunks_exact(src_channels)
113 .zip(dst.chunks_exact_mut(dst_channels))
114 {
115 let rp = lut_lin.get_unchecked(src[src_cn.r_i()]._as_usize());
116 let gp = lut_lin.get_unchecked(src[src_cn.g_i()]._as_usize());
117 let bp = lut_lin.get_unchecked(src[src_cn.b_i()]._as_usize());
118
119 let mut r = _xmm_load_epi32(rp);
120 let mut g = _xmm_load_epi32(gp);
121 let mut b = _xmm_load_epi32(bp);
122 let a = if src_channels == 4 {
123 src[src_cn.a_i()]
124 } else {
125 max_colors
126 };
127
128 r = _mm_shuffle_epi32::<0>(r);
129 g = _mm_shuffle_epi32::<0>(g);
130 b = _mm_shuffle_epi32::<0>(b);
131
132 g = _mm_slli_epi32::<16>(g);
133
134 let zrg0 = _mm_or_si128(r, g);
135 let zbz0 = _mm_or_si128(b, rnd);
136
137 let v0 = _mm_madd_epi16(zrg0, m0);
138 let v1 = _mm_madd_epi16(zbz0, m2);
139
140 let mut v = _mm_add_epi32(v0, v1);
141
142 v = _mm_srai_epi32::<PRECISION>(v);
143 v = _mm_max_epi32(v, _mm_setzero_si128());
144 v = _mm_min_epi32(v, v_max_value);
145
146 _mm_store_si128(temporary.0.as_mut_ptr() as *mut _, v);
147
148 dst[dst_cn.r_i()] = self.profile.gamma[temporary.0[0] as usize];
149 dst[dst_cn.g_i()] = self.profile.gamma[temporary.0[2] as usize];
150 dst[dst_cn.b_i()] = self.profile.gamma[temporary.0[4] as usize];
151 if dst_channels == 4 {
152 dst[dst_cn.a_i()] = a;
153 }
154 }
155 }
156
157 Ok(())
158 }
159
160 #[cfg(feature = "in_place")]
161 #[target_feature(enable = "sse4.1")]
162 unsafe fn transform_in_place_impl(&self, in_out: &mut [T]) -> Result<(), CmsError> {
163 let src_cn = Layout::from(SRC_LAYOUT);
164 let src_channels = src_cn.channels();
165
166 assert_eq!(
167 SRC_LAYOUT, DST_LAYOUT,
168 "This is in-place transform, layout must not diverge"
169 );
170
171 let mut temporary = SseAlignedU16([0; 8]);
172
173 if in_out.len() % src_channels != 0 {
174 return Err(CmsError::LaneMultipleOfChannels);
175 }
176
177 let t = self.profile.adaptation_matrix.transpose();
178
179 let max_colors = ((1 << self.bit_depth) - 1).as_();
180
181 unsafe {
182 let m0 = _mm_setr_epi16(
183 t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
184 );
185 let m2 = _mm_setr_epi16(t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0);
186
187 let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
188 let rnd = _mm_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
189
190 let v_max_value = _mm_set1_epi32(self.gamma_lut as i32 - 1);
191
192 if T::FINITE {
194 let cap = (1 << self.bit_depth) - 1;
195 assert!(self.profile.linear.len() >= cap);
196 } else {
197 assert!(self.profile.linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
198 }
199
200 let lut_lin = &self.profile.linear;
201
202 for dst in in_out.chunks_exact_mut(src_channels) {
203 let rp = lut_lin.get_unchecked(dst[src_cn.r_i()]._as_usize());
204 let gp = lut_lin.get_unchecked(dst[src_cn.g_i()]._as_usize());
205 let bp = lut_lin.get_unchecked(dst[src_cn.b_i()]._as_usize());
206
207 let mut r = _xmm_load_epi32(rp);
208 let mut g = _xmm_load_epi32(gp);
209 let mut b = _xmm_load_epi32(bp);
210 let a = if src_channels == 4 {
211 dst[src_cn.a_i()]
212 } else {
213 max_colors
214 };
215
216 r = _mm_shuffle_epi32::<0>(r);
217 g = _mm_shuffle_epi32::<0>(g);
218 b = _mm_shuffle_epi32::<0>(b);
219
220 g = _mm_slli_epi32::<16>(g);
221
222 let zrg0 = _mm_or_si128(r, g);
223 let zbz0 = _mm_or_si128(b, rnd);
224
225 let v0 = _mm_madd_epi16(zrg0, m0);
226 let v1 = _mm_madd_epi16(zbz0, m2);
227
228 let mut v = _mm_add_epi32(v0, v1);
229
230 v = _mm_srai_epi32::<PRECISION>(v);
231 v = _mm_max_epi32(v, _mm_setzero_si128());
232 v = _mm_min_epi32(v, v_max_value);
233
234 _mm_store_si128(temporary.0.as_mut_ptr() as *mut _, v);
235
236 dst[src_cn.r_i()] = self.profile.gamma[temporary.0[0] as usize];
237 dst[src_cn.g_i()] = self.profile.gamma[temporary.0[2] as usize];
238 dst[src_cn.b_i()] = self.profile.gamma[temporary.0[4] as usize];
239 if src_channels == 4 {
240 dst[src_cn.a_i()] = a;
241 }
242 }
243 }
244
245 Ok(())
246 }
247}
248
249impl<
250 T: Copy + PointeeSizeExpressible + 'static + Default,
251 const SRC_LAYOUT: u8,
252 const DST_LAYOUT: u8,
253 const PRECISION: i32,
254> TransformExecutor<T> for TransformShaperQ2_13OptSse<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
255where
256 u32: AsPrimitive<T>,
257{
258 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
259 unsafe { self.transform_impl(src, dst) }
260 }
261}
262
263#[cfg(feature = "in_place")]
264use crate::InPlaceTransformExecutor;
265
/// Exposes the SSE4.1 in-place kernel through the `InPlaceTransformExecutor` trait.
#[cfg(feature = "in_place")]
impl<T, const SRC_LAYOUT: u8, const DST_LAYOUT: u8, const PRECISION: i32>
    InPlaceTransformExecutor<T>
    for TransformShaperQ2_13OptSse<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
where
    T: Copy + PointeeSizeExpressible + 'static + Default,
    u32: AsPrimitive<T>,
{
    /// Converts `in_out` in place, forwarding the result of
    /// `transform_in_place_impl`.
    fn transform(&self, in_out: &mut [T]) -> Result<(), CmsError> {
        // SAFETY: `transform_in_place_impl` requires SSE4.1. NOTE(review):
        // nothing is checked here, so this relies on the executor only being
        // constructed after SSE4.1 was detected - confirm at the construction
        // site.
        unsafe { Self::transform_in_place_impl(self, in_out) }
    }
}