pxfm/sin_cosf/argument_reduction.rs
1/*
2 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1. Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2. Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3. Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::get_exponent_f32;
30use crate::common::f_fmla;
31use crate::rounding::CpuRound;
32
33#[derive(Debug, Copy, Clone)]
34pub(crate) struct ArgumentReducer {
35 pub(crate) x: f64,
36 pub(crate) x_abs: u32,
37}
38
39impl ArgumentReducer {
40 // Return k and y, where
41 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
42 #[cfg(any(
43 all(
44 any(target_arch = "x86", target_arch = "x86_64"),
45 target_feature = "fma"
46 ),
47 target_arch = "aarch64"
48 ))]
49 #[inline]
50 pub(crate) fn reduce_small(self) -> (f64, i64) {
51 /*
52 Generated in SageMath:
53 z = RealField(300)(32) / RealField(300).pi()
54 n = 53
55 x_hi = RealField(n)(z) # convert to f64
56 x_mid = RealField(n)(z - RealField(300)(x_hi))
57 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
58 print(double_to_hex(x_hi), ",")
59 print(double_to_hex(x_mid), ",")
60 print(double_to_hex(x_lo), ",")
61 */
62 const THIRTYTWO_OVER_PI: [u64; 3] =
63 [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
64
65 let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).cpu_round();
66 let mut y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
67 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
68 (y, kd as i64)
69 }
70
71 // Return k and y, where
72 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
73 #[cfg(not(any(
74 all(
75 any(target_arch = "x86", target_arch = "x86_64"),
76 target_feature = "fma"
77 ),
78 target_arch = "aarch64"
79 )))]
80 #[inline(always)]
81 pub(crate) fn reduce_small(self) -> (f64, i64) {
82 /*
83 Generated in SageMath:
84 z = RealField(300)(32) / RealField(300).pi()
85 n = 28
86 x_hi = RealField(n)(z) # convert to f64
87 x_mid = RealField(n)(z - RealField(300)(x_hi))
88 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
89 print(double_to_hex(x_hi), ",")
90 print(double_to_hex(x_mid), ",")
91 print(double_to_hex(x_lo), ",")
92 */
93 const THIRTYTWO_OVER_PI: [u64; 3] =
94 [0x40245f306e000000, 0xbe3b1bbeae000000, 0x3c63f84eb0000000];
95 let prod = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
96 let kd = prod.cpu_round();
97 let mut y = prod - kd;
98 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
99 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
100 (y, unsafe {
101 kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
102 })
103 }
104
105 #[inline(always)]
106 pub(crate) fn reduce_small_fma(self) -> (f64, i64) {
107 /*
108 Generated in SageMath:
109 z = RealField(300)(32) / RealField(300).pi()
110 n = 53
111 x_hi = RealField(n)(z) # convert to f64
112 x_mid = RealField(n)(z - RealField(300)(x_hi))
113 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
114 print(double_to_hex(x_hi), ",")
115 print(double_to_hex(x_mid), ",")
116 print(double_to_hex(x_lo), ",")
117 */
118 const THIRTYTWO_OVER_PI: [u64; 3] =
119 [0x40245f306dc9c883, 0xbcc6b01ec5417056, 0xb966447e493ad4ce];
120
121 let kd = (self.x * f64::from_bits(THIRTYTWO_OVER_PI[0])).round();
122 let mut y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -kd);
123 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), y);
124 (y, unsafe {
125 kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
126 })
127 }
128
129 // Return k and y, where
130 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
131 #[cfg(any(
132 all(
133 any(target_arch = "x86", target_arch = "x86_64"),
134 target_feature = "fma"
135 ),
136 target_arch = "aarch64"
137 ))]
138 #[inline]
139 pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
140 /*
141 Generated in SageMath:
142 z = RealField(300)(32) / RealField(300).pi()
143 n = 53
144 x_hi = RealField(n)(z) # convert to f64
145 x_mid = RealField(n)(z - RealField(300)(x_hi))
146 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
147 x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
148 x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
149 print(double_to_hex(x_hi), ",")
150 print(double_to_hex(x_mid), ",")
151 print(double_to_hex(x_lo), ",")
152 print(double_to_hex(x_lo_2), ",")
153 print(double_to_hex(x_lo_3), ",")
154 */
155 const THIRTYTWO_OVER_PI: [u64; 5] = [
156 0x40245f306dc9c883,
157 0xbcc6b01ec5417056,
158 0xb966447e493ad4ce,
159 0x360e21c820ff28b2,
160 0xb29508510ea79237,
161 ];
162
163 // 2^45 <= |x| < 2^99
164 if exponent < 99 {
165 // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
166 // contains at least one integral bit <= 2^5.
167 // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
168 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
169 // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
170 // least 2^6.
171 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
172 prod_hi = f64::from_bits(
173 prod_hi.to_bits()
174 & (if exponent < 55 {
175 0xfffffffffffff000
176 } else {
177 0xffffffffffffffff
178 }),
179 ); // |x| < 2^55
180 let k_hi = prod_hi.cpu_round();
181 let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
182 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
183 let k_lo = prod_lo.cpu_round();
184 let mut y = f_fmla(
185 self.x,
186 f64::from_bits(THIRTYTWO_OVER_PI[1]),
187 truncated_prod - k_lo,
188 );
189 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
190 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
191
192 return (y, k_lo as i64);
193 }
194
195 // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
196 // not contain any of the lowest 6 unit bits, so we can ignore it completely.
197 // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
198 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
199 // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
200 // least 64.
201 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
202 prod_hi = f64::from_bits(
203 prod_hi.to_bits()
204 & (if exponent < 110 {
205 0xfffffffffffff000
206 } else {
207 0xffffffffffffffff
208 }),
209 ); // |x| < 2^110
210 let k_hi = prod_hi.cpu_round();
211 let truncated_prod = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
212 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
213 let k_lo = prod_lo.cpu_round();
214 let mut y = f_fmla(
215 self.x,
216 f64::from_bits(THIRTYTWO_OVER_PI[2]),
217 truncated_prod - k_lo,
218 );
219 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
220 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
221
222 (y, k_lo as i64)
223 }
224
225 // Return k and y, where
226 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
227 #[inline(always)]
228 #[allow(unused)]
229 pub(crate) fn reduce_large_fma(self, exponent: i32) -> (f64, i64) {
230 /*
231 Generated in SageMath:
232 z = RealField(300)(32) / RealField(300).pi()
233 n = 53
234 x_hi = RealField(n)(z) # convert to f64
235 x_mid = RealField(n)(z - RealField(300)(x_hi))
236 x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
237 x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
238 x_lo_3 = z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2)
239 print(double_to_hex(x_hi), ",")
240 print(double_to_hex(x_mid), ",")
241 print(double_to_hex(x_lo), ",")
242 print(double_to_hex(x_lo_2), ",")
243 print(double_to_hex(x_lo_3), ",")
244 */
245 const THIRTYTWO_OVER_PI: [u64; 5] = [
246 0x40245f306dc9c883,
247 0xbcc6b01ec5417056,
248 0xb966447e493ad4ce,
249 0x360e21c820ff28b2,
250 0xb29508510ea79237,
251 ];
252
253 // 2^45 <= |x| < 2^99
254 if exponent < 99 {
255 // - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
256 // contains at least one integral bit <= 2^5.
257 // - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
258 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
259 // - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
260 // least 2^6.
261 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[0]);
262 prod_hi = f64::from_bits(
263 prod_hi.to_bits()
264 & (if exponent < 55 {
265 0xfffffffffffff000
266 } else {
267 0xffffffffffffffff
268 }),
269 ); // |x| < 2^55
270 let k_hi = prod_hi.round();
271 let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[0]), -k_hi);
272 let prod_lo =
273 f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), truncated_prod);
274 let k_lo = prod_lo.round();
275 let mut y = f64::mul_add(
276 self.x,
277 f64::from_bits(THIRTYTWO_OVER_PI[1]),
278 truncated_prod - k_lo,
279 );
280 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), y);
281 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
282
283 return (y, k_lo as i64);
284 }
285
286 // - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
287 // not contain any of the lowest 6 unit bits, so we can ignore it completely.
288 // - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
289 // in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
290 // - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
291 // least 64.
292 let mut prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[1]);
293 prod_hi = f64::from_bits(
294 prod_hi.to_bits()
295 & (if exponent < 110 {
296 0xfffffffffffff000
297 } else {
298 0xffffffffffffffff
299 }),
300 ); // |x| < 2^110
301 let k_hi = prod_hi.round();
302 let truncated_prod = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[1]), -k_hi);
303 let prod_lo = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[2]), truncated_prod);
304 let k_lo = prod_lo.round();
305 let mut y = f64::mul_add(
306 self.x,
307 f64::from_bits(THIRTYTWO_OVER_PI[2]),
308 truncated_prod - k_lo,
309 );
310 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[3]), y);
311 y = f64::mul_add(self.x, f64::from_bits(THIRTYTWO_OVER_PI[4]), y);
312
313 (y, k_lo as i64)
314 }
315
316 // Return k and y, where
317 // k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
318 #[cfg(not(any(
319 all(
320 any(target_arch = "x86", target_arch = "x86_64"),
321 target_feature = "fma"
322 ),
323 target_arch = "aarch64"
324 )))]
325 #[inline]
326 pub(crate) fn reduce_large(self, exponent: i32) -> (f64, i64) {
327 // For large range, there are at most 2 parts of THIRTYTWO_OVER_PI_28
328 // contributing to the lowest 6 binary digits (k & 63). If the least
329 // significant bit of x * the least significant bit of THIRTYTWO_OVER_PI_28[i]
330 // >= 64, we can completely ignore THIRTYTWO_OVER_PI_28[i].
331
332 // Generated by SageMath:
333 // z = RealField(300)(32) / RealField(300).pi()
334 // n = 28
335 // x_hi = RealField(n)(z) # convert to f64
336 // x_mid = RealField(n)(z - RealField(300)(x_hi))
337 // x_lo = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid))
338 // x_lo_2 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo))
339 // x_lo_3 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2))
340 // x_lo_4 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3))
341 // x_lo_5 = RealField(n)(z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4))
342 // x_lo_6 = (z - RealField(300)(x_hi) - RealField(300)(x_mid) - RealField(300)(x_lo) - RealField(300)(x_lo_2) - RealField(300)(x_lo_3) - RealField(300)(x_lo_4) - RealField(300)(x_lo_5))
343 //
344 //
345 // print(double_to_hex(x_hi), ",")
346 // print(double_to_hex(x_mid), ",")
347 // print(double_to_hex(x_lo), ",")
348 // print(double_to_hex(x_lo_2), ",")
349 // print(double_to_hex(x_lo_3), ",")
350 // print(double_to_hex(x_lo_4), ",")
351 // print(double_to_hex(x_lo_5), ",")
352 // print(double_to_hex(x_lo_6), ",")
353 static THIRTYTWO_OVER_PI: [u64; 8] = [
354 0x40245f306e000000,
355 0xbe3b1bbeae000000,
356 0x3c63f84eb0000000,
357 0xba87056592000000,
358 0x38bc0db62a000000,
359 0xb6e4cd8778000000,
360 0xb51bef806c000000,
361 0x33363abdebbc561b,
362 ];
363
364 let mut idx = 0;
365 const FRACTION_LEN: i32 = 23;
366 let x_lsb_exp_m4 = exponent - FRACTION_LEN;
367
368 // Exponents of the least significant bits of the corresponding entries in
369 // THIRTYTWO_OVER_PI_28.
370 static THIRTYTWO_OVER_PI_28_LSB_EXP: [i32; 8] =
371 [-24, -55, -81, -114, -143, -170, -200, -230];
372
373 // Skipping the first parts of 32/pi such that:
374 // LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32.
375 while x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5 {
376 idx += 1;
377 }
378
379 let prod_hi = self.x * f64::from_bits(THIRTYTWO_OVER_PI[idx]);
380 // Get the integral part of x * THIRTYTWO_OVER_PI_28[idx]
381 let k_hi = prod_hi.cpu_round();
382 // Get the fractional part of x * THIRTYTWO_OVER_PI_28[idx]
383 let frac = prod_hi - k_hi;
384 let prod_lo = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 1]), frac);
385 let k_lo = prod_lo.cpu_round();
386
387 // Now y is the fractional parts.
388 let mut y = prod_lo - k_lo;
389 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 2]), y);
390 y = f_fmla(self.x, f64::from_bits(THIRTYTWO_OVER_PI[idx + 3]), y);
391
392 (y, (k_hi as i64).wrapping_add(k_lo as i64))
393 }
394
395 #[inline(always)]
396 pub(crate) fn reduce(self) -> (f64, i64) {
397 #[cfg(any(
398 all(
399 any(target_arch = "x86", target_arch = "x86_64"),
400 target_feature = "fma"
401 ),
402 target_arch = "aarch64"
403 ))]
404 const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
405 #[cfg(not(any(
406 all(
407 any(target_arch = "x86", target_arch = "x86_64"),
408 target_feature = "fma"
409 ),
410 target_arch = "aarch64"
411 )))]
412 const SMALL_PASS_BOUND: u32 = 0x4a80_0000u32;
413 if self.x_abs < SMALL_PASS_BOUND {
414 // 2^45
415 self.reduce_small()
416 } else {
417 self.reduce_large(get_exponent_f32(f32::from_bits(self.x_abs)))
418 }
419 }
420
421 #[inline(always)]
422 #[allow(unused)]
423 pub(crate) fn reduce_fma(self) -> (f64, i64) {
424 const SMALL_PASS_BOUND: u32 = 0x5600_0000u32;
425 if self.x_abs < SMALL_PASS_BOUND {
426 // 2^45
427 self.reduce_small_fma()
428 } else {
429 self.reduce_large_fma(get_exponent_f32(f32::from_bits(self.x_abs)))
430 }
431 }
432}