Skip to main content

pxfm/sin_cosf/
argument_reduction.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::get_exponent_f32;
30use crate::common::f_fmla;
31use crate::rounding::CpuRound;
32
33#[derive(Debug, Copy, Clone)]
34pub(crate) struct ArgumentReducer {
35    pub(crate) x: f64,
36    pub(crate) x_abs: u32,
37}
38
39impl ArgumentReducer {
40    // Return k and y, where
41    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
42    #[cfg(any(
43        all(
44            any(target_arch = "x86", target_arch = "x86_64"),
45            target_feature = "fma"
46        ),
47        target_arch = "aarch64"
48    ))]
49    #[inline]
50    pub(crate) fn reduce_small(self) -> (f64, i64) {
51        /*
52           Generated in SageMath:
53           z = RealField(300)(32) / RealField(300).pi()
54           n = 53
55           x_hi = RealField(n)(z)  # convert to f64
56           x_mid = RealField(n)(z - RealField(300)(x_hi))
57           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
58           print(double_to_hex(x_hi), ",")
59           print(double_to_hex(x_mid), ",")
60           print(double_to_hex(x_lo), ",")
61        */
62        const THIRTYTWO_OVER_PI: [u64; 3] =
63            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
64
65        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).cpu_round();
66        let mut y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
67        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
68        (y, unsafe {
69            kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
70        })
71    }
72
73    // Return k and y, where
74    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
75    #[cfg(not(any(
76        all(
77            any(target_arch = "x86", target_arch = "x86_64"),
78            target_feature = "fma"
79        ),
80        target_arch = "aarch64"
81    )))]
82    #[inline(always)]
83    pub(crate) fn reduce_small(self) -> (f64, i64) {
84        /*
85           Generated in SageMath:
86           z = RealField(300)(32) / RealField(300).pi()
87           n = 28
88           x_hi = RealField(n)(z)  # convert to f64
89           x_mid = RealField(n)(z - RealField(300)(x_hi))
90           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
91           print(double_to_hex(x_hi), ",")
92           print(double_to_hex(x_mid), ",")
93           print(double_to_hex(x_lo), ",")
94        */
95        const THIRTYTWO_OVER_PI: [u64; 3] =
96            [0x40245f306e000000, 0xbe3b1bbeae000000, 0x3c63f84eb0000000];
97        let prod = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
98        let kd = prod.cpu_round();
99        let mut y = prod - kd;
100        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
101        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
102        (y, unsafe {
103            kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
104        })
105    }
106
107    #[inline(always)]
108    pub(crate) fn reduce_small_fma(self) -> (f64, i64) {
109        /*
110           Generated in SageMath:
111           z = RealField(300)(32) / RealField(300).pi()
112           n = 53
113           x_hi = RealField(n)(z)  # convert to f64
114           x_mid = RealField(n)(z - RealField(300)(x_hi))
115           x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
116           print(double_to_hex(x_hi), ",")
117           print(double_to_hex(x_mid), ",")
118           print(double_to_hex(x_lo), ",")
119        */
120        const THIRTYTWO_OVER_PI: [u64; 3] =
121            [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
122
123        let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).round();
124        let mut y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
125        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
126        (y, unsafe {
127            kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
128        })
129    }
130
131    // Return k and y, where
132    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
133    #[cfg(any(
134        all(
135            any(target_arch = "x86", target_arch = "x86_64"),
136            target_feature = "fma"
137        ),
138        target_arch = "aarch64"
139    ))]
140    #[inline]
141    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
142        /*
143        Generated in SageMath:
144        z = RealField(300)(32) / RealField(300).pi()
145        n = 53
146        x_hi = RealField(n)(z)  # convert to f64
147        x_mid = RealField(n)(z - RealField(300)(x_hi))
148        x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
149        x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
150        x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
151        print(double_to_hex(x_hi), ",")
152        print(double_to_hex(x_mid), ",")
153        print(double_to_hex(x_lo), ",")
154        print(double_to_hex(x_lo_2), ",")
155        print(double_to_hex(x_lo_3), ",")
156         */
157        const THIRTYTWO_OVER_PI: [u64; 5] = [
158            0x40245f306dc9c883,
159            0xbcc6b01ec5417056,
160            0xb966447e493ad4ce,
161            0x360e21c820ff28b2,
162            0xb29508510ea79237,
163        ];
164
165        // 2^45 <= |x| < 2^99
166        if exponent < 99 {
167            // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
168            // contains at least one integral bit <= 2^5.
169            // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
170            // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
171            // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
172            // least 2^6.
173            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
174            prod_hi = f64::from_bits(
175                prod_hi.to_bits()
176                    & (if exponent < 55 {
177                        0xfffffffffffff000
178                    } else {
179                        0xffffffffffffffff
180                    }),
181            ); // |x| < 2^55
182            let k_hi = prod_hi.cpu_round();
183            let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
184            let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
185            let k_lo = prod_lo.cpu_round();
186            let mut y = f_fmla(
187                self.x,
188                f64::from_bits(THIRTYTWO_OVER_PI[1]),
189                truncated_prod - k_lo,
190            );
191            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
192            y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
193
194            return (y, k_lo as i64);
195        }
196
197        // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
198        // not contain any of the lowest 6 unit bits, so we can ignore it completely.
199        // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
200        // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
201        // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
202        // least 64.
203        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
204        prod_hi = f64::from_bits(
205            prod_hi.to_bits()
206                & (if exponent < 110 {
207                    0xfffffffffffff000
208                } else {
209                    0xffffffffffffffff
210                }),
211        ); // |x| < 2^110
212        let k_hi = prod_hi.cpu_round();
213        let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
214        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
215        let k_lo = prod_lo.cpu_round();
216        let mut y = f_fmla(
217            self.x,
218            f64::from_bits(THIRTYTWO_OVER_PI[2]),
219            truncated_prod - k_lo,
220        );
221        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
222        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
223
224        (y, k_lo as i64)
225    }
226
227    // Return k and y, where
228    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
229    #[inline(always)]
230    #[allow(unused)]
231    pub(crate) fn reduce_large_fma(self, exponent: i32) -> (f64, i64) {
232        /*
233        Generated in SageMath:
234        z = RealField(300)(32) / RealField(300).pi()
235        n = 53
236        x_hi = RealField(n)(z)  # convert to f64
237        x_mid = RealField(n)(z - RealField(300)(x_hi))
238        x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
239        x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
240        x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
241        print(double_to_hex(x_hi), ",")
242        print(double_to_hex(x_mid), ",")
243        print(double_to_hex(x_lo), ",")
244        print(double_to_hex(x_lo_2), ",")
245        print(double_to_hex(x_lo_3), ",")
246         */
247        const THIRTYTWO_OVER_PI: [u64; 5] = [
248            0x40245f306dc9c883,
249            0xbcc6b01ec5417056,
250            0xb966447e493ad4ce,
251            0x360e21c820ff28b2,
252            0xb29508510ea79237,
253        ];
254
255        // 2^45 <= |x| < 2^99
256        if exponent < 99 {
257            // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
258            // contains at least one integral bit <= 2^5.
259            // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
260            // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
261            // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
262            // least 2^6.
263            let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
264            prod_hi = f64::from_bits(
265                prod_hi.to_bits()
266                    & (if exponent < 55 {
267                        0xfffffffffffff000
268                    } else {
269                        0xffffffffffffffff
270                    }),
271            ); // |x| < 2^55
272            let k_hi = prod_hi.round();
273            let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
274            let prod_lo =
275                f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
276            let k_lo = prod_lo.round();
277            let mut y = f64::mul_add(
278                self.x,
279                f64::from_bits(THIRTYTWO_OVER_PI[1]),
280                truncated_prod - k_lo,
281            );
282            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
283            y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
284
285            return (y, k_lo as i64);
286        }
287
288        // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
289        // not contain any of the lowest 6 unit bits, so we can ignore it completely.
290        // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
291        // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
292        // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
293        // least 64.
294        let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
295        prod_hi = f64::from_bits(
296            prod_hi.to_bits()
297                & (if exponent < 110 {
298                    0xfffffffffffff000
299                } else {
300                    0xffffffffffffffff
301                }),
302        ); // |x| < 2^110
303        let k_hi = prod_hi.round();
304        let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
305        let prod_lo = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
306        let k_lo = prod_lo.round();
307        let mut y = f64::mul_add(
308            self.x,
309            f64::from_bits(THIRTYTWO_OVER_PI[2]),
310            truncated_prod - k_lo,
311        );
312        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
313        y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
314
315        (y, k_lo as i64)
316    }
317
318    // Return k and y, where
319    // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
320    #[cfg(not(any(
321        all(
322            any(target_arch = "x86", target_arch = "x86_64"),
323            target_feature = "fma"
324        ),
325        target_arch = "aarch64"
326    )))]
327    #[inline]
328    pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
329        // For large range, there are at most 2 parts of THIRTYTWO_OVER_PI_28
330        // contributing to the lowest 6 binary digits (k & 63).  If the least
331        // significant bit of x * the least significant bit of THIRTYTWO_OVER_PI_28[i]
332        // >= 64, we can completely ignore THIRTYTWO_OVER_PI_28[i].
333
334        // Generated by SageMath:
335        // z = RealField(300)(32) / RealField(300).pi()
336        // n = 28
337        // x_hi = RealField(n)(z)  # convert to f64
338        // x_mid = RealField(n)(z - RealField(300)(x_hi))
339        // x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
340        // x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
341        // x_lo_3 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2))
342        // x_lo_4 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3))
343        // x_lo_5 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4))
344        // x_lo_6 = (z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4) - RealField(300)(x_lo_5))
345        //
346        //
347        // print(double_to_hex(x_hi), ",")
348        // print(double_to_hex(x_mid), ",")
349        // print(double_to_hex(x_lo), ",")
350        // print(double_to_hex(x_lo_2), ",")
351        // print(double_to_hex(x_lo_3), ",")
352        // print(double_to_hex(x_lo_4), ",")
353        // print(double_to_hex(x_lo_5), ",")
354        // print(double_to_hex(x_lo_6), ",")
355        static THIRTYTWO_OVER_PI: [u64; 8] = [
356            0x40245f306e000000,
357            0xbe3b1bbeae000000,
358            0x3c63f84eb0000000,
359            0xba87056592000000,
360            0x38bc0db62a000000,
361            0xb6e4cd8778000000,
362            0xb51bef806c000000,
363            0x33363abdebbc561b,
364        ];
365
366        let mut idx = 0;
367        const FRACTION_LEN: i32 = 23;
368        let x_lsb_exp_m4 = exponent - FRACTION_LEN;
369
370        // Exponents of the least significant bits of the corresponding entries in
371        // THIRTYTWO_OVER_PI_28.
372        static THIRTYTWO_OVER_PI_28_LSB_EXP: [i32; 8] =
373            [-24, -55, -81, -114, -143, -170, -200, -230];
374
375        // Skipping the first parts of 32/pi such that:
376        //   LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32.
377        while x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5 {
378            idx += 1;
379        }
380
381        let prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[idx]);
382        // Get the integral part of x * THIRTYTWO_OVER_PI_28[idx]
383        let k_hi = prod_hi.cpu_round();
384        // Get the fractional part of x * THIRTYTWO_OVER_PI_28[idx]
385        let frac = prod_hi - k_hi;
386        let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 1]), frac);
387        let k_lo = prod_lo.cpu_round();
388
389        // Now y is the fractional parts.
390        let mut y = prod_lo - k_lo;
391        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 2]), y);
392        y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 3]), y);
393
394        (y, (k_hi as i64).wrapping_add(k_lo as i64))
395    }
396
397    #[inline(always)]
398    pub(crate) fn reduce(self) -> (f64, i64) {
399        #[cfg(any(
400            all(
401                any(target_arch = "x86", target_arch = "x86_64"),
402                target_feature = "fma"
403            ),
404            target_arch = "aarch64"
405        ))]
406        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
407        #[cfg(not(any(
408            all(
409                any(target_arch = "x86", target_arch = "x86_64"),
410                target_feature = "fma"
411            ),
412            target_arch = "aarch64"
413        )))]
414        const SMALL_PASS_BOUND: u32 = 0x4a80_0000u32;
415        if self.x_abs < SMALL_PASS_BOUND {
416            // 2^45
417            self.reduce_small()
418        } else {
419            self.reduce_large(get_exponent_f32(f32::from_bits(self.x_abs)))
420        }
421    }
422
423    #[inline(always)]
424    #[allow(unused)]
425    pub(crate) fn reduce_fma(self) -> (f64, i64) {
426        const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
427        if self.x_abs < SMALL_PASS_BOUND {
428            // 2^45
429            self.reduce_small_fma()
430        } else {
431            self.reduce_large_fma(get_exponent_f32(f32::from_bits(self.x_abs)))
432        }
433    }
434}