vello_common/
flatten_simd.rs

1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! This is a temporary module that contains a SIMD version of flattening of cubic curves, as
5//! well as some code that was copied from kurbo, which is needed to reimplement the
6//! full `flatten` method.
7
8use crate::flatten::TOL_2;
9#[cfg(not(feature = "std"))]
10use crate::kurbo::common::FloatFuncs as _;
11use crate::kurbo::{CubicBez, ParamCurve, PathEl, Point, QuadBez};
12use alloc::vec;
13use alloc::vec::Vec;
14use bytemuck::{Pod, Zeroable};
15use fearless_simd::*;
16
17// Unlike kurbo, which takes a closure with a callback for outputting the lines, we use a trait
18// instead. The reason is that this way the callback can be inlined, which is not possible with
19// a closure and turned out to have a noticeable overhead.
20pub(crate) trait Callback {
21    fn callback(&mut self, el: PathEl);
22}
23
24/// See the docs for the kurbo implementation of flattening:
25/// <https://docs.rs/kurbo/latest/kurbo/fn.flatten.html>
26///
27/// This version works using a similar approach but using f32x4/f32x8 SIMD instead.
28pub(crate) fn flatten<S: Simd>(
29    simd: S,
30    path: impl IntoIterator<Item = PathEl>,
31    tolerance: f64,
32    callback: &mut impl Callback,
33    flatten_ctx: &mut FlattenCtx,
34) {
35    let mut flattened_cubics = vec![];
36
37    let sqrt_tol = tolerance.sqrt();
38    let mut last_pt = None;
39
40    for el in path {
41        match el {
42            PathEl::MoveTo(p) => {
43                last_pt = Some(p);
44                callback.callback(PathEl::MoveTo(p));
45            }
46            PathEl::LineTo(p) => {
47                last_pt = Some(p);
48                callback.callback(PathEl::LineTo(p));
49            }
50            PathEl::QuadTo(p1, p2) => {
51                if let Some(p0) = last_pt {
52                    // An upper bound on the shortest distance of any point on the quadratic Bezier
53                    // curve to the line segment [p0, p2] is 1/2 of the maximum of the
54                    // endpoint-to-control-point distances.
55                    //
56                    // The derivation is similar to that for the cubic Bezier (see below). In
57                    // short:
58                    //
59                    // q(t) = B0(t) p0 + B1(t) p1 + B2(t) p2
60                    // dist(q(t), [p0, p1]) <= B1(t) dist(p1, [p0, p1])
61                    //                       = 2 (1-t)t dist(p1, [p0, p1]).
62                    //
63                    // The maximum occurs at t=1/2, hence
64                    // max(dist(q(t), [p0, p1] <= 1/2 dist(p1, [p0, p1])).
65                    //
66                    // A cheap upper bound for dist(p1, [p0, p1]) is max(dist(p1, p0), dist(p1, p2)).
67                    //
68                    // The following takes the square to elide the square root of the Euclidean
69                    // distance.
70                    if f64::max((p1 - p0).hypot2(), (p1 - p2).hypot2()) <= 4. * TOL_2 {
71                        callback.callback(PathEl::LineTo(p2));
72                    } else {
73                        let q = QuadBez::new(p0, p1, p2);
74                        let params = q.estimate_subdiv(sqrt_tol);
75                        let n = ((0.5 * params.val / sqrt_tol).ceil() as usize).max(1);
76                        let step = 1.0 / (n as f64);
77                        for i in 1..n {
78                            let u = (i as f64) * step;
79                            let t = q.determine_subdiv_t(&params, u);
80                            let p = q.eval(t);
81                            callback.callback(PathEl::LineTo(p));
82                        }
83                        callback.callback(PathEl::LineTo(p2));
84                    }
85                }
86                last_pt = Some(p2);
87            }
88            PathEl::CurveTo(p1, p2, p3) => {
89                if let Some(p0) = last_pt {
90                    // An upper bound on the shortest distance of any point on the cubic Bezier
91                    // curve to the line segment [p0, p3] is 3/4 of the maximum of the
92                    // endpoint-to-control-point distances.
93                    //
94                    // With Bernstein weights Bi(t), we have
95                    // c(t) = B0(t) p0 + B1(t) p1 + B2(t) p2 + B3(t) p3
96                    // with t from 0 to 1 (inclusive).
97                    //
98                    // Through convexivity of the Euclidean distance function and the line segment,
99                    // we have
100                    // dist(c(t), [p0, p3]) <= B1(t) dist(p1, [p0, p3]) + B2(t) dist(p2, [p0, p3])
101                    //                      <= B1(t) ||p1-p0|| + B2(t) ||p2-p3||
102                    //                      <= (B1(t) + B2(t)) max(||p1-p0||, ||p2-p3|||)
103                    //                       = 3 ((1-t)t^2 + (1-t)^2t) max(||p1-p0||, ||p2-p3||).
104                    //
105                    // The inner polynomial has its maximum of 1/4 at t=1/2, hence
106                    // max(dist(c(t), [p0, p3])) <= 3/4 max(||p1-p0||, ||p2-p3||).
107                    //
108                    // The following takes the square to elide the square root of the Euclidean
109                    // distance.
110                    if f64::max((p0 - p1).hypot2(), (p3 - p2).hypot2()) <= 16. / 9. * TOL_2 {
111                        callback.callback(PathEl::LineTo(p3));
112                    } else {
113                        let c = CubicBez::new(p0, p1, p2, p3);
114                        let max = simd.vectorize(
115                            #[inline(always)]
116                            || {
117                                flatten_cubic_simd(
118                                    simd,
119                                    c,
120                                    flatten_ctx,
121                                    tolerance as f32,
122                                    &mut flattened_cubics,
123                                )
124                            },
125                        );
126
127                        for p in &flattened_cubics[1..max] {
128                            callback.callback(PathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
129                        }
130                    }
131                }
132                last_pt = Some(p3);
133            }
134            PathEl::ClosePath => {
135                last_pt = None;
136                callback.callback(PathEl::ClosePath);
137            }
138        }
139    }
140}
141
142// The below methods are copied from kurbo and needed to implement flattening of normal quad curves.
143
144/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
145///
146/// This is used for flattening curves.
147fn approx_parabola_integral(x: f64) -> f64 {
148    const D: f64 = 0.67;
149    x / (1.0 - D + (D.powi(4) + 0.25 * x * x).sqrt().sqrt())
150}
151
152/// An approximation to the inverse parabola integral.
153fn approx_parabola_inv_integral(x: f64) -> f64 {
154    const B: f64 = 0.39;
155    x * (1.0 - B + (B * B + 0.25 * x * x).sqrt())
156}
157
158impl FlattenParamsExt for QuadBez {
159    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams {
160        // Determine transformation to $y = x^2$ parabola.
161        let d01 = self.p1 - self.p0;
162        let d12 = self.p2 - self.p1;
163        let dd = d01 - d12;
164        let cross = (self.p2 - self.p0).cross(dd);
165        let x0 = d01.dot(dd) * cross.recip();
166        let x2 = d12.dot(dd) * cross.recip();
167        let scale = (cross / (dd.hypot() * (x2 - x0))).abs();
168
169        // Compute number of subdivisions needed.
170        let a0 = approx_parabola_integral(x0);
171        let a2 = approx_parabola_integral(x2);
172        let val = if scale.is_finite() {
173            let da = (a2 - a0).abs();
174            let sqrt_scale = scale.sqrt();
175            if x0.signum() == x2.signum() {
176                da * sqrt_scale
177            } else {
178                // Handle cusp case (segment contains curvature maximum)
179                let xmin = sqrt_tol / sqrt_scale;
180                sqrt_tol * da / approx_parabola_integral(xmin)
181            }
182        } else {
183            0.0
184        };
185        let u0 = approx_parabola_inv_integral(a0);
186        let u2 = approx_parabola_inv_integral(a2);
187        let uscale = (u2 - u0).recip();
188        FlattenParams {
189            a0,
190            a2,
191            u0,
192            uscale,
193            val,
194        }
195    }
196
197    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64 {
198        let a = params.a0 + (params.a2 - params.a0) * x;
199        let u = approx_parabola_inv_integral(a);
200        (u - params.u0) * params.uscale
201    }
202}
203
204trait FlattenParamsExt {
205    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams;
206    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64;
207}
208
209// Everything below is a SIMD implementation of flattening of cubic curves.
210// It's a combination of https://gist.github.com/raphlinus/5f4e9feb85fd79bafc72da744571ec0e
211// and https://gist.github.com/raphlinus/44e114fef2fd33b889383a60ced0129b.
212
213// TODO(laurenz): Perhaps we should get rid of this in the future and work directly with f32,
214// as it's the only reason we have to pull in proc_macros via the `derive` feature of bytemuck.
215#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
216#[repr(C)]
217struct Point32 {
218    x: f32,
219    y: f32,
220}
221
222struct FlattenParams {
223    a0: f64,
224    a2: f64,
225    u0: f64,
226    uscale: f64,
227    /// The number of `subdivisions * 2 * sqrt_tol`.
228    val: f64,
229}
230
231/// This limit was chosen based on the pre-existing GitHub gist.
232/// This limit should not be hit in normal operation, but _might_ be hit for very large
233/// transforms.
234const MAX_QUADS: usize = 16;
235
236/// The context needed for flattening curves.
237#[derive(Default, Debug)]
238pub struct FlattenCtx {
239    // The +4 is to encourage alignment; might be better to be explicit
240    even_pts: [Point32; MAX_QUADS + 4],
241    odd_pts: [Point32; MAX_QUADS],
242    a0: [f32; MAX_QUADS],
243    da: [f32; MAX_QUADS],
244    u0: [f32; MAX_QUADS],
245    uscale: [f32; MAX_QUADS],
246    val: [f32; MAX_QUADS],
247    n_quads: usize,
248}
249
250#[inline(always)]
251fn is_finite_simd<S: Simd>(x: f32x4<S>) -> mask32x4<S> {
252    let simd = x.simd;
253
254    let x_abs = x.abs();
255    let reinterpreted = u32x4::from_bytes(x_abs.to_bytes());
256    simd.simd_lt_u32x4(reinterpreted, u32x4::splat(simd, 0x7f80_0000))
257}
258
259#[inline(always)]
260fn approx_parabola_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
261    let simd = x.simd;
262
263    const D: f32 = 0.67;
264    const D_POWI_4: f32 = 0.201_511_2;
265
266    let temp1 = f32x8::splat(simd, 0.25).madd(x * x, f32x8::splat(simd, D_POWI_4));
267    let temp2 = temp1.sqrt();
268    let temp3 = temp2.sqrt();
269    let temp4 = f32x8::splat(simd, 1.0) - f32x8::splat(simd, D);
270    let temp5 = temp4 + temp3;
271    x / temp5
272}
273
274#[inline(always)]
275fn approx_parabola_integral_simd_x4<S: Simd>(x: f32x4<S>) -> f32x4<S> {
276    let simd = x.simd;
277
278    const D: f32 = 0.67;
279    const D_POWI_4: f32 = 0.201_511_2;
280
281    let temp1 = f32x4::splat(simd, 0.25).madd(x * x, f32x4::splat(simd, D_POWI_4));
282    let temp2 = temp1.sqrt();
283    let temp3 = temp2.sqrt();
284    let temp4 = f32x4::splat(simd, 1.0) - f32x4::splat(simd, D);
285    let temp5 = temp4 + temp3;
286    x / temp5
287}
288
289#[inline(always)]
290fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
291    let simd = x.simd;
292
293    const B: f32 = 0.39;
294    const ONE_MINUS_B: f32 = 1.0 - B;
295
296    let temp1 = f32x8::splat(simd, B * B);
297    let temp2 = f32x8::splat(simd, 0.25).madd(x * x, temp1);
298    let temp3 = temp2.sqrt();
299    let temp4 = f32x8::splat(simd, ONE_MINUS_B) + temp3;
300
301    x * temp4
302}
303
304#[inline(always)]
305fn pt_splat_simd<S: Simd>(simd: S, pt: Point32) -> f32x8<S> {
306    let p_f64: f64 = bytemuck::cast(pt);
307    simd.reinterpret_f32_f64x4(f64x4::splat(simd, p_f64))
308}
309
310#[inline(always)]
311fn eval_simd<S: Simd>(
312    p0: f32x8<S>,
313    p1: f32x8<S>,
314    p2: f32x8<S>,
315    p3: f32x8<S>,
316    t: f32x8<S>,
317) -> f32x8<S> {
318    let mt = 1.0 - t;
319    let im0 = p0 * mt * mt * mt;
320    let im1 = p1 * mt * mt * 3.0;
321    let im2 = p2 * mt * 3.0;
322    let im3 = p3.madd(t, im2) * t;
323
324    (im1 + im3).madd(t, im0)
325}
326
327#[inline(always)]
328fn eval_cubics_simd<S: Simd>(simd: S, c: &CubicBez, n: usize, result: &mut FlattenCtx) {
329    result.n_quads = n;
330    let dt = 0.5 / n as f32;
331
332    // TODO(laurenz): Perhaps we can SIMDify this better
333    let p0p1 = f32x4::from_slice(
334        simd,
335        &[c.p0.x as f32, c.p0.y as f32, c.p1.x as f32, c.p1.y as f32],
336    );
337    let p2p3 = f32x4::from_slice(
338        simd,
339        &[c.p2.x as f32, c.p2.y as f32, c.p3.x as f32, c.p3.y as f32],
340    );
341
342    let split_single = |input: f32x4<S>| {
343        let t1 = simd.reinterpret_f64_f32x4(input);
344        let p0 = simd.zip_low_f64x2(t1, t1);
345        let p1 = simd.zip_high_f64x2(t1, t1);
346
347        let p0 = simd.reinterpret_f32_f64x2(p0);
348        let p1 = simd.reinterpret_f32_f64x2(p1);
349
350        (f32x8::block_splat(p0), f32x8::block_splat(p1))
351    };
352
353    let (p0_128, p1_128) = split_single(p0p1);
354    let (p2_128, p3_128) = split_single(p2p3);
355
356    let iota = f32x8::from_slice(simd, &[0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0]);
357    let step = iota * dt;
358    let mut t = step;
359    let t_inc = f32x8::splat(simd, 4.0 * dt);
360
361    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.even_pts);
362    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.odd_pts);
363
364    for i in 0..n.div_ceil(2) {
365        let evaluated = eval_simd(p0_128, p1_128, p2_128, p3_128, t);
366        let (low, high) = simd.split_f32x8(evaluated);
367
368        even_pts[i * 4..][..4].copy_from_slice(&low.val);
369        odd_pts[i * 4..][..4].copy_from_slice(&high.val);
370
371        t += t_inc;
372    }
373
374    even_pts[n * 2..][..8].copy_from_slice(&p3_128.val);
375}
376
377#[inline(always)]
378fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
379    let n = ctx.n_quads;
380
381    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.even_pts);
382    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.odd_pts);
383
384    for i in 0..n.div_ceil(4) {
385        let p0 = f32x8::from_slice(simd, &even_pts[i * 8..][..8]);
386        let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
387        let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
388        let x = p0 * -0.5;
389        let x1 = p_onehalf.madd(2.0, x);
390        let p1 = p2.madd(-0.5, x1);
391
392        odd_pts[(i * 8)..][..8].copy_from_slice(&p1.val);
393
394        let d01 = p1 - p0;
395        let d12 = p2 - p1;
396        let d01x = simd.unzip_low_f32x8(d01, d01);
397        let d01y = simd.unzip_high_f32x8(d01, d01);
398        let d12x = simd.unzip_low_f32x8(d12, d12);
399        let d12y = simd.unzip_high_f32x8(d12, d12);
400        let ddx = d01x - d12x;
401        let ddy = d01y - d12y;
402        let d02x = d01x + d12x;
403        let d02y = d01y + d12y;
404        // (d02x * ddy) - (d02y * ddx)
405        let cross = ddx.madd(-d02y, d02x * ddy);
406
407        let x0_x2_a = {
408            let (d01x_low, _) = simd.split_f32x8(d01x);
409            let (d12x_low, _) = simd.split_f32x8(d12x);
410
411            simd.combine_f32x4(d12x_low, d01x_low) * ddx
412        };
413        let temp1 = {
414            let (d12y_low, _) = simd.split_f32x8(d12y);
415            let (d01y_low, _) = simd.split_f32x8(d01y);
416
417            simd.combine_f32x4(d12y_low, d01y_low)
418        };
419        let x0_x2_num = temp1.madd(ddy, x0_x2_a);
420        let x0_x2 = x0_x2_num / cross;
421        let (ddx_low, _) = simd.split_f32x8(ddx);
422        let (ddy_low, _) = simd.split_f32x8(ddy);
423        let dd_hypot = ddy_low.madd(ddy_low, ddx_low * ddx_low).sqrt();
424        let (x0, x2) = simd.split_f32x8(x0_x2);
425        let scale_denom = dd_hypot * (x2 - x0);
426        let (temp2, _) = simd.split_f32x8(cross);
427        let scale = (temp2 / scale_denom).abs();
428        let a0_a2 = approx_parabola_integral_simd(x0_x2);
429        let (a0, a2) = simd.split_f32x8(a0_a2);
430        let da = a2 - a0;
431        let da_abs = da.abs();
432        let sqrt_scale = scale.sqrt();
433        let temp3 = simd.or_i32x4(x0.reinterpret_i32(), x2.reinterpret_i32());
434        let mask = simd.simd_ge_i32x4(temp3, i32x4::splat(simd, 0));
435        let noncusp = da_abs * sqrt_scale;
436        // TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
437        let xmin = sqrt_tol / sqrt_scale;
438        let approxint = approx_parabola_integral_simd_x4(xmin);
439        let cusp = (sqrt_tol * da_abs) / approxint;
440        let val_raw = simd.select_f32x4(mask, noncusp, cusp);
441        let finite_mask = is_finite_simd(val_raw);
442        let val = simd.select_f32x4(finite_mask, val_raw, f32x4::splat(simd, 0.0));
443        let u0_u2 = approx_parabola_inv_integral_simd(a0_a2);
444        let (u0, u2) = simd.split_f32x8(u0_u2);
445        let uscale_a = u2 - u0;
446        let uscale = 1.0 / uscale_a;
447
448        ctx.a0[i * 4..][..4].copy_from_slice(&a0.val);
449        ctx.da[i * 4..][..4].copy_from_slice(&da.val);
450        ctx.u0[i * 4..][..4].copy_from_slice(&u0.val);
451        ctx.uscale[i * 4..][..4].copy_from_slice(&uscale.val);
452        ctx.val[i * 4..][..4].copy_from_slice(&val.val);
453    }
454}
455
456#[inline(always)]
457fn output_lines_simd<S: Simd>(
458    simd: S,
459    ctx: &FlattenCtx,
460    i: usize,
461    x0: f32,
462    dx: f32,
463    n: usize,
464    out: &mut [f32],
465) {
466    let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
467    let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
468    let p2 = pt_splat_simd(simd, ctx.even_pts[i + 1]);
469
470    const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
471    let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
472    let x = iota2.madd(dx, f32x8::splat(simd, x0));
473    let da = f32x8::splat(simd, ctx.da[i]);
474    let mut a = da.madd(x, f32x8::splat(simd, ctx.a0[i]));
475    let a_inc = 4.0 * dx * da;
476    let uscale = f32x8::splat(simd, ctx.uscale[i]);
477
478    for j in 0..n.div_ceil(4) {
479        let u = approx_parabola_inv_integral_simd(a);
480        let t = u.madd(uscale, -ctx.u0[i] * uscale);
481        let mt = 1.0 - t;
482        let z = p0 * mt * mt;
483        let z1 = p1.madd(2.0 * t * mt, z);
484        let p = p2.madd(t * t, z1);
485
486        out[j * 8..][..8].copy_from_slice(&p.val);
487
488        a += a_inc;
489    }
490}
491
492#[inline(always)]
493fn flatten_cubic_simd<S: Simd>(
494    simd: S,
495    c: CubicBez,
496    ctx: &mut FlattenCtx,
497    accuracy: f32,
498    result: &mut Vec<Point32>,
499) -> usize {
500    let n_quads = estimate_num_quads(c, accuracy);
501    eval_cubics_simd(simd, &c, n_quads, ctx);
502    let tol = accuracy * (1.0 - TO_QUAD_TOL);
503    let sqrt_tol = tol.sqrt();
504    estimate_subdiv_simd(simd, sqrt_tol, ctx);
505    let sum: f32 = ctx.val[..n_quads].iter().sum();
506    let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
507    result.resize(n + 4, Point32::default());
508
509    let step = sum / (n as f32);
510    let step_recip = 1.0 / step;
511    let mut val_sum = 0.0;
512    let mut last_n = 0;
513    let mut x0base = 0.0;
514
515    for i in 0..n_quads {
516        let val = ctx.val[i];
517        val_sum += val;
518        let this_n = val_sum * step_recip;
519        let this_n_next = 1.0 + this_n.floor();
520        let dn = this_n_next as usize - last_n;
521        if dn > 0 {
522            let dx = step / val;
523            let x0 = x0base * dx;
524            output_lines_simd(
525                simd,
526                ctx,
527                i,
528                x0,
529                dx,
530                dn,
531                bytemuck::cast_slice_mut(&mut result[last_n..]),
532            );
533        }
534        x0base = this_n_next - this_n;
535        last_n = this_n_next as usize;
536    }
537
538    result[n] = ctx.even_pts[n_quads];
539
540    n + 1
541}
542
543#[inline(always)]
544fn estimate_num_quads(c: CubicBez, accuracy: f32) -> usize {
545    let q_accuracy = (accuracy * TO_QUAD_TOL) as f64;
546    let max_hypot2 = 432.0 * q_accuracy * q_accuracy;
547    let p1x2 = c.p1.to_vec2() * 3.0 - c.p0.to_vec2();
548    let p2x2 = c.p2.to_vec2() * 3.0 - c.p3.to_vec2();
549    let err = (p2x2 - p1x2).hypot2();
550    let err_div = err / max_hypot2;
551
552    estimate(err_div)
553}
554
555const TO_QUAD_TOL: f32 = 0.1;
556
557#[inline(always)]
558fn estimate(err_div: f64) -> usize {
559    // The original version of this method was:
560    // let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
561    // n_quads.min(MAX_QUADS)
562    //
563    // Note how we always round up and clamp to the range [1, max_quads]. Since we don't
564    // care about the actual fractional value resulting from the powf call we can simply
565    // compute this using a precomputed lookup table evaluating 1^6, 2^6, 3^6, etc. and simply
566    // comparing if the value is less than or equal to each threshold.
567
568    const LUT: [f64; MAX_QUADS] = [
569        1.0, 64.0, 729.0, 4096.0, 15625.0, 46656.0, 117649.0, 262144.0, 531441.0, 1000000.0,
570        1771561.0, 2985984.0, 4826809.0, 7529536.0, 11390625.0, 16777216.0,
571    ];
572
573    #[expect(clippy::needless_range_loop, reason = "better clarity")]
574    for i in 0..MAX_QUADS {
575        if err_div <= LUT[i] {
576            return i + 1;
577        }
578    }
579
580    MAX_QUADS
581}
582
583#[cfg(test)]
584mod tests {
585    use crate::flatten_simd::{MAX_QUADS, estimate};
586
587    fn old_estimate(err_div: f64) -> usize {
588        let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
589        n_quads.min(MAX_QUADS)
590    }
591
592    // Test is disabled by default since it takes 10-20 seconds to run, even in release mode.
593    #[test]
594    #[ignore]
595    fn accuracy() {
596        for i in 0..u32::MAX {
597            let num = f32::from_bits(i);
598
599            if num.is_finite() {
600                assert_eq!(old_estimate(num as f64), estimate(num as f64), "{num}");
601            }
602        }
603    }
604}