Skip to main content

pxfm/sin_cosf/
argument_reduction.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::get_exponent_f32;
30use crate::common::f_fmla;
31use crate::rounding::CpuRound;
32
33#[derive(Debug, Copy, Clone)]
34pub(crate) struct ArgumentReducer {
35    pub(crate) x: f64,
36    pub(crate) x_abs: u32,
37}
38
39impl ArgumentReducer {
40    // Return k and y, where
41    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
42    #[cfg(any(
43        all(
44            any(target_arch = "x86", target_arch = "x86_64"),
45            target_feature = "fma"
46        ),
47        target_arch = "aarch64"
48    ))]
49    #[inline]
50    pub(crate) fn reduce_small(self) -> (f64, i64) {
51        /*
52           Generated in SageMath:
53           z = RealField(300)(32) / RealField(300).pi()
54           n = 53
55           x_hi = RealField(n)(z)  # convert to f64
56           x_mid = RealField(n)(z - RealField(300)(x_hi))
57           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
58           print(double_to_hex(x_hi), ",")
59           print(double_to_hex(x_mid), ",")
60           print(double_to_hex(x_lo), ",")
61        */
62        const THIRTYTWO_OVER_PI: [u64; 3] =
63            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
64
65        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).cpu_round();
66        let mut y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
67        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
68        (y, kd as i64)
69    }
70
71    // Return k and y, where
72    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
73    #[cfg(not(any(
74        all(
75            any(target_arch = "x86", target_arch = "x86_64"),
76            target_feature = "fma"
77        ),
78        target_arch = "aarch64"
79    )))]
80    #[inline(always)]
81    pub(crate) fn reduce_small(self) -> (f64, i64) {
82        /*
83           Generated in SageMath:
84           z = RealField(300)(32) / RealField(300).pi()
85           n = 28
86           x_hi = RealField(n)(z)  # convert to f64
87           x_mid = RealField(n)(z - RealField(300)(x_hi))
88           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
89           print(double_to_hex(x_hi), ",")
90           print(double_to_hex(x_mid), ",")
91           print(double_to_hex(x_lo), ",")
92        */
93        const THIRTYTWO_OVER_PI: [u64; 3] =
94            [0x40245f306e000000, 0xbe3b1bbeae000000, 0x3c63f84eb0000000];
95        let prod = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
96        let kd = prod.cpu_round();
97        let mut y = prod - kd;
98        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
99        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
100        (y, unsafe {
101            kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
102        })
103    }
104
105    #[inline(always)]
106    pub(crate) fn reduce_small_fma(self) -> (f64, i64) {
107        /*
108           Generated in SageMath:
109           z = RealField(300)(32) / RealField(300).pi()
110           n = 53
111           x_hi = RealField(n)(z)  # convert to f64
112           x_mid = RealField(n)(z - RealField(300)(x_hi))
113           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
114           print(double_to_hex(x_hi), ",")
115           print(double_to_hex(x_mid), ",")
116           print(double_to_hex(x_lo), ",")
117        */
118        const THIRTYTWO_OVER_PI: [u64; 3] =
119            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
120
121        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).round();
122        let mut y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
123        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
124        (y, unsafe {
125            kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
126        })
127    }
128
129    // Return k and y, where
130    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
131    #[cfg(any(
132        all(
133            any(target_arch = "x86", target_arch = "x86_64"),
134            target_feature = "fma"
135        ),
136        target_arch = "aarch64"
137    ))]
138    #[inline]
139    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
140        /*
141        Generated in SageMath:
142        z = RealField(300)(32) / RealField(300).pi()
143        n = 53
144        x_hi = RealField(n)(z)  # convert to f64
145        x_mid = RealField(n)(z - RealField(300)(x_hi))
146        x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
147        x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
148        x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
149        print(double_to_hex(x_hi), ",")
150        print(double_to_hex(x_mid), ",")
151        print(double_to_hex(x_lo), ",")
152        print(double_to_hex(x_lo_2), ",")
153        print(double_to_hex(x_lo_3), ",")
154         */
155        const THIRTYTWO_OVER_PI: [u64; 5] = [
156            0x40245f306dc9c883,
157            0xbcc6b01ec5417056,
158            0xb966447e493ad4ce,
159            0x360e21c820ff28b2,
160            0xb29508510ea79237,
161        ];
162
163        // 2^45 <= |x| < 2^99
164        if exponent < 99 {
165            // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
166            // contains at least one integral bit <= 2^5.
167            // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
168            // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
169            // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
170            // least 2^6.
171            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
172            prod_hi = f64::from_bits(
173                prod_hi.to_bits()
174                    & (if exponent < 55 {
175                        0xfffffffffffff000
176                    } else {
177                        0xffffffffffffffff
178                    }),
179            ); // |x| < 2^55
180            let k_hi = prod_hi.cpu_round();
181            let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
182            let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
183            let k_lo = prod_lo.cpu_round();
184            let mut y = f_fmla(
185                self.x,
186                f64::from_bits(THIRTYTWO_OVER_PI[1]),
187                truncated_prod - k_lo,
188            );
189            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
190            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
191
192            return (y, k_lo as i64);
193        }
194
195        // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
196        // not contain any of the lowest 6 unit bits, so we can ignore it completely.
197        // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
198        // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
199        // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
200        // least 64.
201        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
202        prod_hi = f64::from_bits(
203            prod_hi.to_bits()
204                & (if exponent < 110 {
205                    0xfffffffffffff000
206                } else {
207                    0xffffffffffffffff
208                }),
209        ); // |x| < 2^110
210        let k_hi = prod_hi.cpu_round();
211        let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
212        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
213        let k_lo = prod_lo.cpu_round();
214        let mut y = f_fmla(
215            self.x,
216            f64::from_bits(THIRTYTWO_OVER_PI[2]),
217            truncated_prod - k_lo,
218        );
219        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
220        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
221
222        (y, k_lo as i64)
223    }
224
225    // Return k and y, where
226    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
227    #[inline(always)]
228    #[allow(unused)]
229    pub(crate) fn reduce_large_fma(self, exponent: i32) -> (f64, i64) {
230        /*
231        Generated in SageMath:
232        z = RealField(300)(32) / RealField(300).pi()
233        n = 53
234        x_hi = RealField(n)(z)  # convert to f64
235        x_mid = RealField(n)(z - RealField(300)(x_hi))
236        x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
237        x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
238        x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
239        print(double_to_hex(x_hi), ",")
240        print(double_to_hex(x_mid), ",")
241        print(double_to_hex(x_lo), ",")
242        print(double_to_hex(x_lo_2), ",")
243        print(double_to_hex(x_lo_3), ",")
244         */
245        const THIRTYTWO_OVER_PI: [u64; 5] = [
246            0x40245f306dc9c883,
247            0xbcc6b01ec5417056,
248            0xb966447e493ad4ce,
249            0x360e21c820ff28b2,
250            0xb29508510ea79237,
251        ];
252
253        // 2^45 <= |x| < 2^99
254        if exponent < 99 {
255            // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
256            // contains at least one integral bit <= 2^5.
257            // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
258            // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
259            // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
260            // least 2^6.
261            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
262            prod_hi = f64::from_bits(
263                prod_hi.to_bits()
264                    & (if exponent < 55 {
265                        0xfffffffffffff000
266                    } else {
267                        0xffffffffffffffff
268                    }),
269            ); // |x| < 2^55
270            let k_hi = prod_hi.round();
271            let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
272            let prod_lo =
273                f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
274            let k_lo = prod_lo.round();
275            let mut y = f64::mul_add(
276                self.x,
277                f64::from_bits(THIRTYTWO_OVER_PI[1]),
278                truncated_prod - k_lo,
279            );
280            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
281            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
282
283            return (y, k_lo as i64);
284        }
285
286        // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
287        // not contain any of the lowest 6 unit bits, so we can ignore it completely.
288        // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
289        // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
290        // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
291        // least 64.
292        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
293        prod_hi = f64::from_bits(
294            prod_hi.to_bits()
295                & (if exponent < 110 {
296                    0xfffffffffffff000
297                } else {
298                    0xffffffffffffffff
299                }),
300        ); // |x| < 2^110
301        let k_hi = prod_hi.round();
302        let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
303        let prod_lo = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
304        let k_lo = prod_lo.round();
305        let mut y = f64::mul_add(
306            self.x,
307            f64::from_bits(THIRTYTWO_OVER_PI[2]),
308            truncated_prod - k_lo,
309        );
310        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
311        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
312
313        (y, k_lo as i64)
314    }
315
316    // Return k and y, where
317    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
318    #[cfg(not(any(
319        all(
320            any(target_arch = "x86", target_arch = "x86_64"),
321            target_feature = "fma"
322        ),
323        target_arch = "aarch64"
324    )))]
325    #[inline]
326    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
327        // For large range, there are at most 2 parts of THIRTYTWO_OVER_PI_28
328        // contributing to the lowest 6 binary digits (k & 63).  If the least
329        // significant bit of x * the least significant bit of THIRTYTWO_OVER_PI_28[i]
330        // >= 64, we can completely ignore THIRTYTWO_OVER_PI_28[i].
331
332        // Generated by SageMath:
333        // z = RealField(300)(32) / RealField(300).pi()
334        // n = 28
335        // x_hi = RealField(n)(z)  # convert to f64
336        // x_mid = RealField(n)(z - RealField(300)(x_hi))
337        // x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
338        // x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
339        // x_lo_3 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2))
340        // x_lo_4 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3))
341        // x_lo_5 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4))
342        // x_lo_6 = (z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4) - RealField(300)(x_lo_5))
343        //
344        //
345        // print(double_to_hex(x_hi), ",")
346        // print(double_to_hex(x_mid), ",")
347        // print(double_to_hex(x_lo), ",")
348        // print(double_to_hex(x_lo_2), ",")
349        // print(double_to_hex(x_lo_3), ",")
350        // print(double_to_hex(x_lo_4), ",")
351        // print(double_to_hex(x_lo_5), ",")
352        // print(double_to_hex(x_lo_6), ",")
353        static THIRTYTWO_OVER_PI: [u64; 8] = [
354            0x40245f306e000000,
355            0xbe3b1bbeae000000,
356            0x3c63f84eb0000000,
357            0xba87056592000000,
358            0x38bc0db62a000000,
359            0xb6e4cd8778000000,
360            0xb51bef806c000000,
361            0x33363abdebbc561b,
362        ];
363
364        let mut idx = 0;
365        const FRACTION_LEN: i32 = 23;
366        let x_lsb_exp_m4 = exponent - FRACTION_LEN;
367
368        // Exponents of the least significant bits of the corresponding entries in
369        // THIRTYTWO_OVER_PI_28.
370        static THIRTYTWO_OVER_PI_28_LSB_EXP: [i32; 8] =
371            [-24, -55, -81, -114, -143, -170, -200, -230];
372
373        // Skipping the first parts of 32/pi such that:
374        //   LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32.
375        while x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5 {
376            idx += 1;
377        }
378
379        let prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[idx]);
380        // Get the integral part of x * THIRTYTWO_OVER_PI_28[idx]
381        let k_hi = prod_hi.cpu_round();
382        // Get the fractional part of x * THIRTYTWO_OVER_PI_28[idx]
383        let frac = prod_hi - k_hi;
384        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 1]), frac);
385        let k_lo = prod_lo.cpu_round();
386
387        // Now y is the fractional parts.
388        let mut y = prod_lo - k_lo;
389        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 2]), y);
390        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 3]), y);
391
392        (y, (k_hi as i64).wrapping_add(k_lo as i64))
393    }
394
395    #[inline(always)]
396    pub(crate) fn reduce(self) -> (f64, i64) {
397        #[cfg(any(
398            all(
399                any(target_arch = "x86", target_arch = "x86_64"),
400                target_feature = "fma"
401            ),
402            target_arch = "aarch64"
403        ))]
404        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
405        #[cfg(not(any(
406            all(
407                any(target_arch = "x86", target_arch = "x86_64"),
408                target_feature = "fma"
409            ),
410            target_arch = "aarch64"
411        )))]
412        const SMALL_PASS_BOUND: u32 = 0x4a80_0000u32;
413        if self.x_abs < SMALL_PASS_BOUND {
414            // 2^45
415            self.reduce_small()
416        } else {
417            self.reduce_large(get_exponent_f32(f32::from_bits(self.x_abs)))
418        }
419    }
420
421    #[inline(always)]
422    #[allow(unused)]
423    pub(crate) fn reduce_fma(self) -> (f64, i64) {
424        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
425        if self.x_abs < SMALL_PASS_BOUND {
426            // 2^45
427            self.reduce_small_fma()
428        } else {
429            self.reduce_large_fma(get_exponent_f32(f32::from_bits(self.x_abs)))
430        }
431    }
432}