pxfm/sin_cosf/argument_reduction.rs
1/*
2 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1. Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2. Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3. Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::get_exponent_f32;
30use crate::common::f_fmla;
31use crate::rounding::CpuRound;
32
33#[derive(Debug, Copy, Clone)]
34pub(crate) struct ArgumentReducer {
35 pub(crate) x: f64,
36 pub(crate) x_abs: u32,
37}
38
39impl ArgumentReducer {
40 // Return k and y, where
41 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
42 #[cfg(any(
43 all(
44 any(target_arch = "x86", target_arch = "x86_64"),
45 target_feature = "fma"
46 ),
47 target_arch = "aarch64"
48 ))]
49 #[inline]
50 pub(crate) fn reduce_small(self) -> (f64, i64) {
51 /*
52 Generated in SageMath:
53 z = RealField(300)(32) / RealField(300).pi()
54 n = 53
55 x_hi = RealField(n)(z) # convert to f64
56 x_mid = RealField(n)(z - RealField(300)(x_hi))
57 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
58 print(double_to_hex(x_hi), ",")
59 print(double_to_hex(x_mid), ",")
60 print(double_to_hex(x_lo), ",")
61 */
62 const THIRTYTWO_OVER_PI: [u64; 3] =
63 [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
64
65 let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).cpu_round();
66 let mut y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
67 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
68 (y, unsafe {
69 kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
70 })
71 }
72
73 // Return k and y, where
74 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
75 #[cfg(not(any(
76 all(
77 any(target_arch = "x86", target_arch = "x86_64"),
78 target_feature = "fma"
79 ),
80 target_arch = "aarch64"
81 )))]
82 #[inline(always)]
83 pub(crate) fn reduce_small(self) -> (f64, i64) {
84 /*
85 Generated in SageMath:
86 z = RealField(300)(32) / RealField(300).pi()
87 n = 28
88 x_hi = RealField(n)(z) # convert to f64
89 x_mid = RealField(n)(z - RealField(300)(x_hi))
90 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
91 print(double_to_hex(x_hi), ",")
92 print(double_to_hex(x_mid), ",")
93 print(double_to_hex(x_lo), ",")
94 */
95 const THIRTYTWO_OVER_PI: [u64; 3] =
96 [0x40245f306e000000, 0xbe3b1bbeae000000, 0x3c63f84eb0000000];
97 let prod = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
98 let kd = prod.cpu_round();
99 let mut y = prod - kd;
100 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
101 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
102 (y, unsafe {
103 kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
104 })
105 }
106
107 #[inline(always)]
108 pub(crate) fn reduce_small_fma(self) -> (f64, i64) {
109 /*
110 Generated in SageMath:
111 z = RealField(300)(32) / RealField(300).pi()
112 n = 53
113 x_hi = RealField(n)(z) # convert to f64
114 x_mid = RealField(n)(z - RealField(300)(x_hi))
115 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
116 print(double_to_hex(x_hi), ",")
117 print(double_to_hex(x_mid), ",")
118 print(double_to_hex(x_lo), ",")
119 */
120 const THIRTYTWO_OVER_PI: [u64; 3] =
121 [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
122
123 let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).round();
124 let mut y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
125 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
126 (y, unsafe {
127 kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
128 })
129 }
130
131 // Return k and y, where
132 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
133 #[cfg(any(
134 all(
135 any(target_arch = "x86", target_arch = "x86_64"),
136 target_feature = "fma"
137 ),
138 target_arch = "aarch64"
139 ))]
140 #[inline]
141 pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
142 /*
143 Generated in SageMath:
144 z = RealField(300)(32) / RealField(300).pi()
145 n = 53
146 x_hi = RealField(n)(z) # convert to f64
147 x_mid = RealField(n)(z - RealField(300)(x_hi))
148 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
149 x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
150 x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
151 print(double_to_hex(x_hi), ",")
152 print(double_to_hex(x_mid), ",")
153 print(double_to_hex(x_lo), ",")
154 print(double_to_hex(x_lo_2), ",")
155 print(double_to_hex(x_lo_3), ",")
156 */
157 const THIRTYTWO_OVER_PI: [u64; 5] = [
158 0x40245f306dc9c883,
159 0xbcc6b01ec5417056,
160 0xb966447e493ad4ce,
161 0x360e21c820ff28b2,
162 0xb29508510ea79237,
163 ];
164
165 // 2^45 <= |x| < 2^99
166 if exponent < 99 {
167 // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
168 // contains at least one integral bit <= 2^5.
169 // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
170 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
171 // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
172 // least 2^6.
173 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
174 prod_hi = f64::from_bits(
175 prod_hi.to_bits()
176 & (if exponent < 55 {
177 0xfffffffffffff000
178 } else {
179 0xffffffffffffffff
180 }),
181 ); // |x| < 2^55
182 let k_hi = prod_hi.cpu_round();
183 let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
184 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
185 let k_lo = prod_lo.cpu_round();
186 let mut y = f_fmla(
187 self.x,
188 f64::from_bits(THIRTYTWO_OVER_PI[1]),
189 truncated_prod - k_lo,
190 );
191 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
192 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
193
194 return (y, k_lo as i64);
195 }
196
197 // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
198 // not contain any of the lowest 6 unit bits, so we can ignore it completely.
199 // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
200 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
201 // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
202 // least 64.
203 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
204 prod_hi = f64::from_bits(
205 prod_hi.to_bits()
206 & (if exponent < 110 {
207 0xfffffffffffff000
208 } else {
209 0xffffffffffffffff
210 }),
211 ); // |x| < 2^110
212 let k_hi = prod_hi.cpu_round();
213 let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
214 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
215 let k_lo = prod_lo.cpu_round();
216 let mut y = f_fmla(
217 self.x,
218 f64::from_bits(THIRTYTWO_OVER_PI[2]),
219 truncated_prod - k_lo,
220 );
221 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
222 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
223
224 (y, k_lo as i64)
225 }
226
227 // Return k and y, where
228 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
229 #[inline(always)]
230 #[allow(unused)]
231 pub(crate) fn reduce_large_fma(self, exponent: i32) -> (f64, i64) {
232 /*
233 Generated in SageMath:
234 z = RealField(300)(32) / RealField(300).pi()
235 n = 53
236 x_hi = RealField(n)(z) # convert to f64
237 x_mid = RealField(n)(z - RealField(300)(x_hi))
238 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
239 x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
240 x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
241 print(double_to_hex(x_hi), ",")
242 print(double_to_hex(x_mid), ",")
243 print(double_to_hex(x_lo), ",")
244 print(double_to_hex(x_lo_2), ",")
245 print(double_to_hex(x_lo_3), ",")
246 */
247 const THIRTYTWO_OVER_PI: [u64; 5] = [
248 0x40245f306dc9c883,
249 0xbcc6b01ec5417056,
250 0xb966447e493ad4ce,
251 0x360e21c820ff28b2,
252 0xb29508510ea79237,
253 ];
254
255 // 2^45 <= |x| < 2^99
256 if exponent < 99 {
257 // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
258 // contains at least one integral bit <= 2^5.
259 // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
260 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
261 // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
262 // least 2^6.
263 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
264 prod_hi = f64::from_bits(
265 prod_hi.to_bits()
266 & (if exponent < 55 {
267 0xfffffffffffff000
268 } else {
269 0xffffffffffffffff
270 }),
271 ); // |x| < 2^55
272 let k_hi = prod_hi.round();
273 let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
274 let prod_lo =
275 f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
276 let k_lo = prod_lo.round();
277 let mut y = f64::mul_add(
278 self.x,
279 f64::from_bits(THIRTYTWO_OVER_PI[1]),
280 truncated_prod - k_lo,
281 );
282 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
283 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
284
285 return (y, k_lo as i64);
286 }
287
288 // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
289 // not contain any of the lowest 6 unit bits, so we can ignore it completely.
290 // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
291 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
292 // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
293 // least 64.
294 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
295 prod_hi = f64::from_bits(
296 prod_hi.to_bits()
297 & (if exponent < 110 {
298 0xfffffffffffff000
299 } else {
300 0xffffffffffffffff
301 }),
302 ); // |x| < 2^110
303 let k_hi = prod_hi.round();
304 let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
305 let prod_lo = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
306 let k_lo = prod_lo.round();
307 let mut y = f64::mul_add(
308 self.x,
309 f64::from_bits(THIRTYTWO_OVER_PI[2]),
310 truncated_prod - k_lo,
311 );
312 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
313 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
314
315 (y, k_lo as i64)
316 }
317
318 // Return k and y, where
319 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
320 #[cfg(not(any(
321 all(
322 any(target_arch = "x86", target_arch = "x86_64"),
323 target_feature = "fma"
324 ),
325 target_arch = "aarch64"
326 )))]
327 #[inline]
328 pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
329 // For large range, there are at most 2 parts of THIRTYTWO_OVER_PI_28
330 // contributing to the lowest 6 binary digits (k & 63). If the least
331 // significant bit of x * the least significant bit of THIRTYTWO_OVER_PI_28[i]
332 // >= 64, we can completely ignore THIRTYTWO_OVER_PI_28[i].
333
334 // Generated by SageMath:
335 // z = RealField(300)(32) / RealField(300).pi()
336 // n = 28
337 // x_hi = RealField(n)(z) # convert to f64
338 // x_mid = RealField(n)(z - RealField(300)(x_hi))
339 // x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
340 // x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
341 // x_lo_3 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2))
342 // x_lo_4 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3))
343 // x_lo_5 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4))
344 // x_lo_6 = (z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4) - RealField(300)(x_lo_5))
345 //
346 //
347 // print(double_to_hex(x_hi), ",")
348 // print(double_to_hex(x_mid), ",")
349 // print(double_to_hex(x_lo), ",")
350 // print(double_to_hex(x_lo_2), ",")
351 // print(double_to_hex(x_lo_3), ",")
352 // print(double_to_hex(x_lo_4), ",")
353 // print(double_to_hex(x_lo_5), ",")
354 // print(double_to_hex(x_lo_6), ",")
355 static THIRTYTWO_OVER_PI: [u64; 8] = [
356 0x40245f306e000000,
357 0xbe3b1bbeae000000,
358 0x3c63f84eb0000000,
359 0xba87056592000000,
360 0x38bc0db62a000000,
361 0xb6e4cd8778000000,
362 0xb51bef806c000000,
363 0x33363abdebbc561b,
364 ];
365
366 let mut idx = 0;
367 const FRACTION_LEN: i32 = 23;
368 let x_lsb_exp_m4 = exponent - FRACTION_LEN;
369
370 // Exponents of the least significant bits of the corresponding entries in
371 // THIRTYTWO_OVER_PI_28.
372 static THIRTYTWO_OVER_PI_28_LSB_EXP: [i32; 8] =
373 [-24, -55, -81, -114, -143, -170, -200, -230];
374
375 // Skipping the first parts of 32/pi such that:
376 // LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32.
377 while x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5 {
378 idx += 1;
379 }
380
381 let prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[idx]);
382 // Get the integral part of x * THIRTYTWO_OVER_PI_28[idx]
383 let k_hi = prod_hi.cpu_round();
384 // Get the fractional part of x * THIRTYTWO_OVER_PI_28[idx]
385 let frac = prod_hi - k_hi;
386 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 1]), frac);
387 let k_lo = prod_lo.cpu_round();
388
389 // Now y is the fractional parts.
390 let mut y = prod_lo - k_lo;
391 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 2]), y);
392 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 3]), y);
393
394 (y, (k_hi as i64).wrapping_add(k_lo as i64))
395 }
396
397 #[inline(always)]
398 pub(crate) fn reduce(self) -> (f64, i64) {
399 #[cfg(any(
400 all(
401 any(target_arch = "x86", target_arch = "x86_64"),
402 target_feature = "fma"
403 ),
404 target_arch = "aarch64"
405 ))]
406 const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
407 #[cfg(not(any(
408 all(
409 any(target_arch = "x86", target_arch = "x86_64"),
410 target_feature = "fma"
411 ),
412 target_arch = "aarch64"
413 )))]
414 const SMALL_PASS_BOUND: u32 = 0x4a80_0000u32;
415 if self.x_abs < SMALL_PASS_BOUND {
416 // 2^45
417 self.reduce_small()
418 } else {
419 self.reduce_large(get_exponent_f32(f32::from_bits(self.x_abs)))
420 }
421 }
422
423 #[inline(always)]
424 #[allow(unused)]
425 pub(crate) fn reduce_fma(self) -> (f64, i64) {
426 const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
427 if self.x_abs < SMALL_PASS_BOUND {
428 // 2^45
429 self.reduce_small_fma()
430 } else {
431 self.reduce_large_fma(get_exponent_f32(f32::from_bits(self.x_abs)))
432 }
433 }
434}