Skip to main content

vello_common/
flatten_simd.rs

1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! This is a temporary module that contains a SIMD version of flattening of cubic curves, as
5//! well as some code that was copied from kurbo, which is needed to reimplement the
6//! full `flatten` method.
7
8use crate::flatten::TOL_2;
9#[cfg(not(feature = "std"))]
10use crate::kurbo::common::FloatFuncs as _;
11use crate::kurbo::{CubicBez, ParamCurve, PathEl, Point, QuadBez};
12use alloc::vec::Vec;
13use bytemuck::{Pod, Zeroable};
14use fearless_simd::*;
15
16/// The element of a path made of lines.
17///
18/// Each subpath must start with a `MoveTo`. Closing of subpaths is not supported, and subpaths are
19/// not closed implicitly when a new subpath (with `MoveTo`) is started. It is expected that closed
20/// subpaths are watertight in the sense that the last `LineTo` matches exactly with the first
21/// `MoveTo`.
22///
23/// This intentionally allows for non-watertight subpaths, as, e.g., lines that are fully outside
24/// of the viewport do not need to be drawn.
25///
26/// See [`PathEl`] for a more general-purpose path element type.
27pub(crate) enum LinePathEl {
28    MoveTo(Point),
29    LineTo(Point),
30}
31
32// Unlike kurbo, which takes a closure with a callback for outputting the lines, we use a trait
33// instead. The reason is that this way the callback can be inlined, which is not possible with
34// a closure and turned out to have a noticeable overhead.
35pub(crate) trait Callback {
36    fn callback(&mut self, el: LinePathEl);
37}
38
39/// See the docs for the kurbo implementation of flattening:
40/// <https://docs.rs/kurbo/latest/kurbo/fn.flatten.html>
41///
42/// This version works using a similar approach but using f32x4/f32x8 SIMD instead.
43#[inline(always)]
44pub(crate) fn flatten<S: Simd>(
45    simd: S,
46    path: impl IntoIterator<Item = PathEl>,
47    tolerance: f64,
48    callback: &mut impl Callback,
49    flatten_ctx: &mut FlattenCtx,
50) {
51    flatten_ctx.flattened_cubics.clear();
52
53    let sqrt_tol = tolerance.sqrt();
54    let mut closed = true;
55    let mut start_pt = Point::ZERO;
56    let mut last_pt = Point::ZERO;
57
58    for el in path {
59        match el {
60            PathEl::MoveTo(p) => {
61                if !closed && last_pt != start_pt {
62                    callback.callback(LinePathEl::LineTo(start_pt));
63                }
64                closed = false;
65                last_pt = p;
66                start_pt = p;
67                callback.callback(LinePathEl::MoveTo(p));
68            }
69            PathEl::LineTo(p) => {
70                debug_assert!(!closed, "Expected a `MoveTo` before a `LineTo`");
71                last_pt = p;
72                callback.callback(LinePathEl::LineTo(p));
73            }
74            PathEl::QuadTo(p1, p2) => {
75                debug_assert!(!closed, "Expected a `MoveTo` before a `QuadTo`");
76                let p0 = last_pt;
77                // An upper bound on the shortest distance of any point on the quadratic Bezier
78                // curve to the line segment [p0, p2] is 1/2 of the maximum of the
79                // endpoint-to-control-point distances.
80                //
81                // The derivation is similar to that for the cubic Bezier (see below). In
82                // short:
83                //
84                // q}(t) = B0(t) p0 + B1(t) p1 + B2(t) p2
85                // dist(q(t), [p0, p1]) <= B1(t) dist(p1, [p0, p1])
86                //                       = 2 (1-t)t dist(p1, [p0, p1]).
87                //
88                // The maximum occurs at t=1/2, hence
89                // max(dist(q(t), [p0, p1] <= 1/2 dist(p1, [p0, p1])).
90                //
91                // A cheap upper bound for dist(p1, [p0, p1]) is max(dist(p1, p0), dist(p1, p2)).
92                //
93                // The following takes the square to elide the square root of the Euclidean
94                // distance.
95                if f64::max((p1 - p0).hypot2(), (p1 - p2).hypot2()) <= 4. * TOL_2 {
96                    callback.callback(LinePathEl::LineTo(p2));
97                } else {
98                    let q = QuadBez::new(p0, p1, p2);
99                    let params = q.estimate_subdiv(sqrt_tol);
100                    let n = ((0.5 * params.val / sqrt_tol).ceil() as usize).max(1);
101                    let step = 1.0 / (n as f64);
102                    for i in 1..n {
103                        let u = (i as f64) * step;
104                        let t = q.determine_subdiv_t(&params, u);
105                        let p = q.eval(t);
106                        callback.callback(LinePathEl::LineTo(p));
107                    }
108                    callback.callback(LinePathEl::LineTo(p2));
109                }
110                last_pt = p2;
111            }
112            PathEl::CurveTo(p1, p2, p3) => {
113                debug_assert!(!closed, "Expected a `MoveTo` before a `CurveTo`");
114                let p0 = last_pt;
115                // An upper bound on the shortest distance of any point on the cubic Bezier
116                // curve to the line segment [p0, p3] is 3/4 of the maximum of the
117                // endpoint-to-control-point distances.
118                //
119                // With Bernstein weights Bi(t), we have
120                // c(t) = B0(t) p0 + B1(t) p1 + B2(t) p2 + B3(t) p3
121                // with t from 0 to 1 (inclusive).
122                //
123                // Through convexivity of the Euclidean distance function and the line segment,
124                // we have
125                // dist(c(t), [p0, p3]) <= B1(t) dist(p1, [p0, p3]) + B2(t) dist(p2, [p0, p3])
126                //                      <= B1(t) ||p1-p0|| + B2(t) ||p2-p3||
127                //                      <= (B1(t) + B2(t)) max(||p1-p0||, ||p2-p3|||)
128                //                       = 3 ((1-t)t^2 + (1-t)^2t) max(||p1-p0||, ||p2-p3||).
129                //
130                // The inner polynomial has its maximum of 1/4 at t=1/2, hence
131                // max(dist(c(t), [p0, p3])) <= 3/4 max(||p1-p0||, ||p2-p3||).
132                //
133                // The following takes the square to elide the square root of the Euclidean
134                // distance.
135                if f64::max((p0 - p1).hypot2(), (p3 - p2).hypot2()) <= 16. / 9. * TOL_2 {
136                    callback.callback(LinePathEl::LineTo(p3));
137                } else {
138                    let c = CubicBez::new(p0, p1, p2, p3);
139                    let max = flatten_cubic_simd(simd, c, flatten_ctx, tolerance as f32);
140
141                    for p in &flatten_ctx.flattened_cubics[1..max] {
142                        callback.callback(LinePathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
143                    }
144                }
145                last_pt = p3;
146            }
147            PathEl::ClosePath => {
148                closed = true;
149                if last_pt != start_pt {
150                    callback.callback(LinePathEl::LineTo(start_pt));
151                }
152            }
153        }
154    }
155
156    if !closed && last_pt != start_pt {
157        callback.callback(LinePathEl::LineTo(start_pt));
158    }
159}
160
161// The below methods are copied from kurbo and needed to implement flattening of normal quad curves.
162
163/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
164///
165/// This is used for flattening curves.
166fn approx_parabola_integral(x: f64) -> f64 {
167    const D: f64 = 0.67;
168    x / (1.0 - D + (D.powi(4) + 0.25 * x * x).sqrt().sqrt())
169}
170
171/// An approximation to the inverse parabola integral.
172fn approx_parabola_inv_integral(x: f64) -> f64 {
173    const B: f64 = 0.39;
174    x * (1.0 - B + (B * B + 0.25 * x * x).sqrt())
175}
176
177impl FlattenParamsExt for QuadBez {
178    #[inline(always)]
179    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams {
180        // Determine transformation to $y = x^2$ parabola.
181        let d01 = self.p1 - self.p0;
182        let d12 = self.p2 - self.p1;
183        let dd = d01 - d12;
184        let cross = (self.p2 - self.p0).cross(dd);
185        let x0 = d01.dot(dd) * cross.recip();
186        let x2 = d12.dot(dd) * cross.recip();
187        let scale = (cross / (dd.hypot() * (x2 - x0))).abs();
188
189        // Compute number of subdivisions needed.
190        let a0 = approx_parabola_integral(x0);
191        let a2 = approx_parabola_integral(x2);
192        let val = if scale.is_finite() {
193            let da = (a2 - a0).abs();
194            let sqrt_scale = scale.sqrt();
195            if x0.signum() == x2.signum() {
196                da * sqrt_scale
197            } else {
198                // Handle cusp case (segment contains curvature maximum)
199                let xmin = sqrt_tol / sqrt_scale;
200                sqrt_tol * da / approx_parabola_integral(xmin)
201            }
202        } else {
203            0.0
204        };
205        let u0 = approx_parabola_inv_integral(a0);
206        let u2 = approx_parabola_inv_integral(a2);
207        let uscale = (u2 - u0).recip();
208        FlattenParams {
209            a0,
210            a2,
211            u0,
212            uscale,
213            val,
214        }
215    }
216
217    #[inline(always)]
218    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64 {
219        let a = params.a0 + (params.a2 - params.a0) * x;
220        let u = approx_parabola_inv_integral(a);
221        (u - params.u0) * params.uscale
222    }
223}
224
225trait FlattenParamsExt {
226    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams;
227    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64;
228}
229
230// Everything below is a SIMD implementation of flattening of cubic curves.
231// It's a combination of https://gist.github.com/raphlinus/5f4e9feb85fd79bafc72da744571ec0e
232// and https://gist.github.com/raphlinus/44e114fef2fd33b889383a60ced0129b.
233
234// TODO(laurenz): Perhaps we should get rid of this in the future and work directly with f32,
235// as it's the only reason we have to pull in proc_macros via the `derive` feature of bytemuck.
236#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
237#[repr(C)]
238struct Point32 {
239    x: f32,
240    y: f32,
241}
242
243struct FlattenParams {
244    a0: f64,
245    a2: f64,
246    u0: f64,
247    uscale: f64,
248    /// The number of `subdivisions * 2 * sqrt_tol`.
249    val: f64,
250}
251
252/// This limit was chosen based on the pre-existing GitHub gist.
253/// This limit should not be hit in normal operation, but _might_ be hit for very large
254/// transforms.
255const MAX_QUADS: usize = 16;
256
257/// The context needed for flattening curves.
258#[derive(Default, Debug)]
259pub struct FlattenCtx {
260    // The +4 is to encourage alignment; might be better to be explicit
261    even_pts: [Point32; MAX_QUADS + 4],
262    odd_pts: [Point32; MAX_QUADS],
263    a0: [f32; MAX_QUADS],
264    da: [f32; MAX_QUADS],
265    u0: [f32; MAX_QUADS],
266    uscale: [f32; MAX_QUADS],
267    val: [f32; MAX_QUADS],
268    n_quads: usize,
269    /// Reusable buffer for flattened cubic points.
270    flattened_cubics: Vec<Point32>,
271}
272
273#[inline(always)]
274fn is_finite_simd<S: Simd>(x: f32x4<S>) -> mask32x4<S> {
275    let simd = x.simd;
276
277    let x_abs = x.abs();
278    let reinterpreted = u32x4::from_bytes(x_abs.to_bytes());
279    simd.simd_lt_u32x4(reinterpreted, u32x4::splat(simd, 0x7f80_0000))
280}
281
282#[inline(always)]
283fn approx_parabola_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
284    let simd = x.simd;
285
286    const D: f32 = 0.67;
287    const D_POWI_4: f32 = 0.201_511_2;
288
289    let temp1 = f32x8::splat(simd, 0.25).madd(x * x, f32x8::splat(simd, D_POWI_4));
290    let temp2 = temp1.sqrt();
291    let temp3 = temp2.sqrt();
292    let temp4 = f32x8::splat(simd, 1.0) - f32x8::splat(simd, D);
293    let temp5 = temp4 + temp3;
294    x / temp5
295}
296
297#[inline(always)]
298fn approx_parabola_integral_simd_x4<S: Simd>(x: f32x4<S>) -> f32x4<S> {
299    let simd = x.simd;
300
301    const D: f32 = 0.67;
302    const D_POWI_4: f32 = 0.201_511_2;
303
304    let temp1 = f32x4::splat(simd, 0.25).madd(x * x, f32x4::splat(simd, D_POWI_4));
305    let temp2 = temp1.sqrt();
306    let temp3 = temp2.sqrt();
307    let temp4 = f32x4::splat(simd, 1.0) - f32x4::splat(simd, D);
308    let temp5 = temp4 + temp3;
309    x / temp5
310}
311
312#[inline(always)]
313fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
314    let simd = x.simd;
315
316    const B: f32 = 0.39;
317    const ONE_MINUS_B: f32 = 1.0 - B;
318
319    let temp1 = f32x8::splat(simd, B * B);
320    let temp2 = f32x8::splat(simd, 0.25).madd(x * x, temp1);
321    let temp3 = temp2.sqrt();
322    let temp4 = f32x8::splat(simd, ONE_MINUS_B) + temp3;
323
324    x * temp4
325}
326
327#[inline(always)]
328fn pt_splat_simd<S: Simd>(simd: S, pt: Point32) -> f32x8<S> {
329    let p_f64: f64 = bytemuck::cast(pt);
330    simd.reinterpret_f32_f64x4(f64x4::splat(simd, p_f64))
331}
332
333#[inline(always)]
334fn eval_simd<S: Simd>(
335    p0: f32x8<S>,
336    p1: f32x8<S>,
337    p2: f32x8<S>,
338    p3: f32x8<S>,
339    t: f32x8<S>,
340) -> f32x8<S> {
341    let mt = 1.0 - t;
342    let im0 = p0 * mt * mt * mt;
343    let im1 = p1 * mt * mt * 3.0;
344    let im2 = p2 * mt * 3.0;
345    let im3 = p3.madd(t, im2) * t;
346
347    (im1 + im3).madd(t, im0)
348}
349
350#[inline(always)]
351fn eval_cubics_simd<S: Simd>(simd: S, c: &CubicBez, n: usize, result: &mut FlattenCtx) {
352    result.n_quads = n;
353    let dt = 0.5 / n as f32;
354
355    // TODO(laurenz): Perhaps we can SIMDify this better
356    let p0p1 = f32x4::from_slice(
357        simd,
358        &[c.p0.x as f32, c.p0.y as f32, c.p1.x as f32, c.p1.y as f32],
359    );
360    let p2p3 = f32x4::from_slice(
361        simd,
362        &[c.p2.x as f32, c.p2.y as f32, c.p3.x as f32, c.p3.y as f32],
363    );
364
365    let split_single = |input: f32x4<S>| {
366        let t1 = simd.reinterpret_f64_f32x4(input);
367        let p0 = simd.zip_low_f64x2(t1, t1);
368        let p1 = simd.zip_high_f64x2(t1, t1);
369
370        let p0 = simd.reinterpret_f32_f64x2(p0);
371        let p1 = simd.reinterpret_f32_f64x2(p1);
372
373        (f32x8::block_splat(p0), f32x8::block_splat(p1))
374    };
375
376    let (p0_128, p1_128) = split_single(p0p1);
377    let (p2_128, p3_128) = split_single(p2p3);
378
379    let iota = f32x8::from_slice(simd, &[0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0]);
380    let step = iota * dt;
381    let mut t = step;
382    let t_inc = f32x8::splat(simd, 4.0 * dt);
383
384    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.even_pts);
385    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.odd_pts);
386
387    for i in 0..n.div_ceil(2) {
388        let evaluated = eval_simd(p0_128, p1_128, p2_128, p3_128, t);
389        let (low, high) = simd.split_f32x8(evaluated);
390
391        even_pts[i * 4..][..4].copy_from_slice(low.as_slice());
392        odd_pts[i * 4..][..4].copy_from_slice(high.as_slice());
393
394        t += t_inc;
395    }
396
397    even_pts[n * 2..][..8].copy_from_slice(p3_128.as_slice());
398}
399
400#[inline(always)]
401fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
402    let n = ctx.n_quads;
403
404    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.even_pts);
405    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.odd_pts);
406
407    for i in 0..n.div_ceil(4) {
408        let p0 = f32x8::from_slice(simd, &even_pts[i * 8..][..8]);
409        let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
410        let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
411        let x = p0 * -0.5;
412        let x1 = p_onehalf.madd(2.0, x);
413        let p1 = p2.madd(-0.5, x1);
414
415        odd_pts[(i * 8)..][..8].copy_from_slice(p1.as_slice());
416
417        let d01 = p1 - p0;
418        let d12 = p2 - p1;
419        let d01x = simd.unzip_low_f32x8(d01, d01);
420        let d01y = simd.unzip_high_f32x8(d01, d01);
421        let d12x = simd.unzip_low_f32x8(d12, d12);
422        let d12y = simd.unzip_high_f32x8(d12, d12);
423        let ddx = d01x - d12x;
424        let ddy = d01y - d12y;
425        let d02x = d01x + d12x;
426        let d02y = d01y + d12y;
427        // (d02x * ddy) - (d02y * ddx)
428        let cross = ddx.madd(-d02y, d02x * ddy);
429
430        let x0_x2_a = {
431            let (d01x_low, _) = simd.split_f32x8(d01x);
432            let (d12x_low, _) = simd.split_f32x8(d12x);
433
434            simd.combine_f32x4(d12x_low, d01x_low) * ddx
435        };
436        let temp1 = {
437            let (d12y_low, _) = simd.split_f32x8(d12y);
438            let (d01y_low, _) = simd.split_f32x8(d01y);
439
440            simd.combine_f32x4(d12y_low, d01y_low)
441        };
442        let x0_x2_num = temp1.madd(ddy, x0_x2_a);
443        let x0_x2 = x0_x2_num / cross;
444        let (ddx_low, _) = simd.split_f32x8(ddx);
445        let (ddy_low, _) = simd.split_f32x8(ddy);
446        let dd_hypot = ddy_low.madd(ddy_low, ddx_low * ddx_low).sqrt();
447        let (x0, x2) = simd.split_f32x8(x0_x2);
448        let scale_denom = dd_hypot * (x2 - x0);
449        let (temp2, _) = simd.split_f32x8(cross);
450        let scale = (temp2 / scale_denom).abs();
451        let a0_a2 = approx_parabola_integral_simd(x0_x2);
452        let (a0, a2) = simd.split_f32x8(a0_a2);
453        let da = a2 - a0;
454        let da_abs = da.abs();
455        let sqrt_scale = scale.sqrt();
456        let temp3 = simd.or_i32x4(x0.bitcast(), x2.bitcast());
457        let mask = simd.simd_ge_i32x4(temp3, i32x4::splat(simd, 0));
458        let noncusp = da_abs * sqrt_scale;
459        // TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
460        let xmin = sqrt_tol / sqrt_scale;
461        let approxint = approx_parabola_integral_simd_x4(xmin);
462        let cusp = (sqrt_tol * da_abs) / approxint;
463        let val_raw = simd.select_f32x4(mask, noncusp, cusp);
464        let finite_mask = is_finite_simd(val_raw);
465        let val = simd.select_f32x4(finite_mask, val_raw, f32x4::splat(simd, 0.0));
466        let u0_u2 = approx_parabola_inv_integral_simd(a0_a2);
467        let (u0, u2) = simd.split_f32x8(u0_u2);
468        let uscale_a = u2 - u0;
469        let uscale = 1.0 / uscale_a;
470
471        ctx.a0[i * 4..][..4].copy_from_slice(a0.as_slice());
472        ctx.da[i * 4..][..4].copy_from_slice(da.as_slice());
473        ctx.u0[i * 4..][..4].copy_from_slice(u0.as_slice());
474        ctx.uscale[i * 4..][..4].copy_from_slice(uscale.as_slice());
475        ctx.val[i * 4..][..4].copy_from_slice(val.as_slice());
476    }
477}
478
479#[inline(always)]
480fn output_lines_simd<S: Simd>(
481    simd: S,
482    ctx: &mut FlattenCtx,
483    i: usize,
484    x0: f32,
485    dx: f32,
486    n: usize,
487    start_idx: usize,
488) {
489    let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
490    let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
491    let p2 = pt_splat_simd(simd, ctx.even_pts[i + 1]);
492
493    const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
494    let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
495    let x = iota2.madd(dx, f32x8::splat(simd, x0));
496    let da = f32x8::splat(simd, ctx.da[i]);
497    let mut a = da.madd(x, f32x8::splat(simd, ctx.a0[i]));
498    let a_inc = 4.0 * dx * da;
499    let uscale = f32x8::splat(simd, ctx.uscale[i]);
500
501    let out: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.flattened_cubics[start_idx..]);
502
503    for j in 0..n.div_ceil(4) {
504        let u = approx_parabola_inv_integral_simd(a);
505        let t = u.madd(uscale, -ctx.u0[i] * uscale);
506        let mt = 1.0 - t;
507        let z = p0 * mt * mt;
508        let z1 = p1.madd(2.0 * t * mt, z);
509        let p = p2.madd(t * t, z1);
510
511        out[j * 8..][..8].copy_from_slice(p.as_slice());
512
513        a += a_inc;
514    }
515}
516
517#[inline(always)]
518fn flatten_cubic_simd<S: Simd>(simd: S, c: CubicBez, ctx: &mut FlattenCtx, accuracy: f32) -> usize {
519    let n_quads = estimate_num_quads(c, accuracy);
520    eval_cubics_simd(simd, &c, n_quads, ctx);
521    let tol = accuracy * (1.0 - TO_QUAD_TOL);
522    let sqrt_tol = tol.sqrt();
523    estimate_subdiv_simd(simd, sqrt_tol, ctx);
524    let sum: f32 = ctx.val[..n_quads].iter().sum();
525    let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
526    ctx.flattened_cubics.resize(n + 4, Point32::default());
527
528    let step = sum / (n as f32);
529    let step_recip = 1.0 / step;
530    let mut val_sum = 0.0;
531    let mut last_n = 0;
532    let mut x0base = 0.0;
533
534    for i in 0..n_quads {
535        let val = ctx.val[i];
536        val_sum += val;
537        let this_n = val_sum * step_recip;
538        let this_n_next = 1.0 + this_n.floor();
539        let dn = this_n_next as usize - last_n;
540        if dn > 0 {
541            let dx = step / val;
542            let x0 = x0base * dx;
543            output_lines_simd(simd, ctx, i, x0, dx, dn, last_n);
544        }
545        x0base = this_n_next - this_n;
546        last_n = this_n_next as usize;
547    }
548
549    ctx.flattened_cubics[n] = ctx.even_pts[n_quads];
550
551    n + 1
552}
553
554#[inline(always)]
555fn estimate_num_quads(c: CubicBez, accuracy: f32) -> usize {
556    let q_accuracy = (accuracy * TO_QUAD_TOL) as f64;
557    let max_hypot2 = 432.0 * q_accuracy * q_accuracy;
558    let p1x2 = c.p1.to_vec2() * 3.0 - c.p0.to_vec2();
559    let p2x2 = c.p2.to_vec2() * 3.0 - c.p3.to_vec2();
560    let err = (p2x2 - p1x2).hypot2();
561    let err_div = err / max_hypot2;
562
563    estimate(err_div)
564}
565
566const TO_QUAD_TOL: f32 = 0.1;
567
568#[inline(always)]
569fn estimate(err_div: f64) -> usize {
570    // The original version of this method was:
571    // let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
572    // n_quads.min(MAX_QUADS)
573    //
574    // Note how we always round up and clamp to the range [1, max_quads]. Since we don't
575    // care about the actual fractional value resulting from the powf call we can simply
576    // compute this using a precomputed lookup table evaluating 1^6, 2^6, 3^6, etc. and simply
577    // comparing if the value is less than or equal to each threshold.
578
579    const LUT: [f64; MAX_QUADS] = [
580        1.0, 64.0, 729.0, 4096.0, 15625.0, 46656.0, 117649.0, 262144.0, 531441.0, 1000000.0,
581        1771561.0, 2985984.0, 4826809.0, 7529536.0, 11390625.0, 16777216.0,
582    ];
583
584    #[expect(clippy::needless_range_loop, reason = "better clarity")]
585    for i in 0..MAX_QUADS {
586        if err_div <= LUT[i] {
587            return i + 1;
588        }
589    }
590
591    MAX_QUADS
592}
593
594#[cfg(test)]
595mod tests {
596    use crate::flatten_simd::{MAX_QUADS, estimate};
597
598    fn old_estimate(err_div: f64) -> usize {
599        let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
600        n_quads.min(MAX_QUADS)
601    }
602
603    // Test is disabled by default since it takes 10-20 seconds to run, even in release mode.
604    #[test]
605    #[ignore]
606    fn accuracy() {
607        for i in 0..u32::MAX {
608            let num = f32::from_bits(i);
609
610            if num.is_finite() {
611                assert_eq!(old_estimate(num as f64), estimate(num as f64), "{num}");
612            }
613        }
614    }
615}