half/binary16/arch.rs

#![allow(dead_code, unused_imports)]
use crate::leading_zeros::leading_zeros_u16;
use core::mem;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;

#[cfg(target_arch = "aarch64")]
mod aarch64;

#[cfg(all(feature = "nightly", target_arch = "loongarch64"))]
mod loongarch64;

macro_rules! convert_fn {
    (if x86_feature("f16c") { $f16c:expr }
    else if aarch64_feature("fp16") { $aarch64:expr }
    else if loongarch64_feature("lsx") { $loongarch64:expr }
    else { $fallback:expr }) => {
        cfg_if::cfg_if! {
            // Use intrinsics directly when the required feature is enabled at compile time
            // (the only way to use them on no_std, where runtime detection is unavailable)
            if #[cfg(all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "f16c"
            ))] {
                $f16c
            }
            else if #[cfg(all(
                target_arch = "aarch64",
                target_feature = "fp16"
            ))] {
                $aarch64
            }
            else if #[cfg(all(
                feature = "nightly",
                target_arch = "loongarch64",
                target_feature = "lsx"
            ))] {
                $loongarch64
            }

            // Use CPU feature detection if using std
            else if #[cfg(all(
                feature = "std",
                any(target_arch = "x86", target_arch = "x86_64")
            ))] {
                use std::arch::is_x86_feature_detected;
                if is_x86_feature_detected!("f16c") {
                    $f16c
                } else {
                    $fallback
                }
            }
            else if #[cfg(all(
                feature = "std",
                target_arch = "aarch64",
            ))] {
                use std::arch::is_aarch64_feature_detected;
                if is_aarch64_feature_detected!("fp16") {
                    $aarch64
                } else {
                    $fallback
                }
            }
            else if #[cfg(all(
                feature = "std",
                feature = "nightly",
                target_arch = "loongarch64",
            ))] {
                use std::arch::is_loongarch_feature_detected;
                if is_loongarch_feature_detected!("lsx") {
                    $loongarch64
                } else {
                    $fallback
                }
            }

            // Fallback to software
            else {
                $fallback
            }
        }
    };
}
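// Illustrative note (not generated code): on an x86_64 build with `std` but without
// `-C target-feature=+f16c`, a call such as `f32_to_f16(f)` takes the runtime-detection
// branch above and behaves roughly like:
//
//     if is_x86_feature_detected!("f16c") {
//         unsafe { x86::f32_to_f16_x86_f16c(f) }
//     } else {
//         f32_to_f16_fallback(f)
//     }
//
// When the feature is already enabled at compile time (or on a no_std build), the macro
// selects either the intrinsic branch or the software fallback unconditionally, with no
// runtime check.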

#[inline]
pub(crate) fn f32_to_f16(f: f32) -> u16 {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f32_to_f16_x86_f16c(f) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f32_to_f16_fp16(f) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f32_to_f16_lsx(f) }
        } else {
            f32_to_f16_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f64_to_f16(f: f64) -> u16 {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f32_to_f16_x86_f16c(f as f32) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f64_to_f16_fp16(f) }
        } else if loongarch64_feature("lsx") {
            f64_to_f16_fallback(f)
        } else {
            f64_to_f16_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f16_to_f32(i: u16) -> f32 {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16_to_f32_x86_f16c(i) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f16_to_f32_fp16(i) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f16_to_f32_lsx(i) }
        } else {
            f16_to_f32_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f16_to_f64(i: u16) -> f64 {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16_to_f32_x86_f16c(i) as f64 }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f16_to_f64_fp16(i) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f16_to_f32_lsx(i) as f64 }
        } else {
            f16_to_f64_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f32x4_to_f16x4_x86_f16c(f) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f32x4_to_f16x4_fp16(f) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f32x4_to_f16x4_lsx(f) }
        } else {
            f32x4_to_f16x4_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16x4_to_f32x4_x86_f16c(i) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f16x4_to_f32x4_fp16(i) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f16x4_to_f32x4_lsx(i) }
        } else {
            f16x4_to_f32x4_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f64x4_to_f16x4_x86_f16c(f) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f64x4_to_f16x4_fp16(f) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f64x4_to_f16x4_lsx(f) }
        } else {
            f64x4_to_f16x4_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16x4_to_f64x4_x86_f16c(i) }
        } else if aarch64_feature("fp16") {
            unsafe { aarch64::f16x4_to_f64x4_fp16(i) }
        } else if loongarch64_feature("lsx") {
            unsafe { loongarch64::f16x4_to_f64x4_lsx(i) }
        } else {
            f16x4_to_f64x4_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f32x8_to_f16x8_x86_f16c(f) }
        } else if aarch64_feature("fp16") {
            {
                let mut result = [0u16; 8];
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
                    aarch64::f32x4_to_f16x4_fp16);
                result
            }
        } else if loongarch64_feature("lsx") {
            {
                let mut result = [0u16; 8];
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
                    loongarch64::f32x4_to_f16x4_lsx);
                result
            }
        } else {
            f32x8_to_f16x8_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16x8_to_f32x8_x86_f16c(i) }
        } else if aarch64_feature("fp16") {
            {
                let mut result = [0f32; 8];
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
                    aarch64::f16x4_to_f32x4_fp16);
                result
            }
        } else if loongarch64_feature("lsx") {
            {
                let mut result = [0f32; 8];
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
                    loongarch64::f16x4_to_f32x4_lsx);
                result
            }
        } else {
            f16x8_to_f32x8_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f64x8_to_f16x8_x86_f16c(f) }
        } else if aarch64_feature("fp16") {
            {
                let mut result = [0u16; 8];
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
                    aarch64::f64x4_to_f16x4_fp16);
                result
            }
        } else if loongarch64_feature("lsx") {
            {
                let mut result = [0u16; 8];
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
                    loongarch64::f64x4_to_f16x4_lsx);
                result
            }
        } else {
            f64x8_to_f16x8_fallback(f)
        }
    }
}

#[inline]
pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] {
    convert_fn! {
        if x86_feature("f16c") {
            unsafe { x86::f16x8_to_f64x8_x86_f16c(i) }
        } else if aarch64_feature("fp16") {
            {
                let mut result = [0f64; 8];
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
                    aarch64::f16x4_to_f64x4_fp16);
                result
            }
        } else if loongarch64_feature("lsx") {
            {
                let mut result = [0f64; 8];
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
                    loongarch64::f16x4_to_f64x4_lsx);
                result
            }
        } else {
            f16x8_to_f64x8_fallback(i)
        }
    }
}

#[inline]
pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) {
    convert_fn! {
        if x86_feature("f16c") {
            convert_chunked_slice_8(src, dst, x86::f32x8_to_f16x8_x86_f16c,
                x86::f32x4_to_f16x4_x86_f16c)
        } else if aarch64_feature("fp16") {
            convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16)
        } else if loongarch64_feature("lsx") {
            convert_chunked_slice_4(src, dst, loongarch64::f32x4_to_f16x4_lsx)
        } else {
            slice_fallback(src, dst, f32_to_f16_fallback)
        }
    }
}

#[inline]
pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) {
    convert_fn! {
        if x86_feature("f16c") {
            convert_chunked_slice_8(src, dst, x86::f16x8_to_f32x8_x86_f16c,
                x86::f16x4_to_f32x4_x86_f16c)
        } else if aarch64_feature("fp16") {
            convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16)
        } else if loongarch64_feature("lsx") {
            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f32x4_lsx)
        } else {
            slice_fallback(src, dst, f16_to_f32_fallback)
        }
    }
}

#[inline]
pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) {
    convert_fn! {
        if x86_feature("f16c") {
            convert_chunked_slice_8(src, dst, x86::f64x8_to_f16x8_x86_f16c,
                x86::f64x4_to_f16x4_x86_f16c)
        } else if aarch64_feature("fp16") {
            convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16)
        } else if loongarch64_feature("lsx") {
            convert_chunked_slice_4(src, dst, loongarch64::f64x4_to_f16x4_lsx)
        } else {
            slice_fallback(src, dst, f64_to_f16_fallback)
        }
    }
}

#[inline]
pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) {
    convert_fn! {
        if x86_feature("f16c") {
            convert_chunked_slice_8(src, dst, x86::f16x8_to_f64x8_x86_f16c,
                x86::f16x4_to_f64x4_x86_f16c)
        } else if aarch64_feature("fp16") {
            convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16)
        } else if loongarch64_feature("lsx") {
            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f64x4_lsx)
        } else {
            slice_fallback(src, dst, f16_to_f64_fallback)
        }
    }
}

macro_rules! math_fn {
    (if aarch64_feature("fp16") { $aarch64:expr }
    else { $fallback:expr }) => {
        cfg_if::cfg_if! {
            // Use intrinsics directly when the required feature is enabled at compile time
            // (the only way to use them on no_std, where runtime detection is unavailable)
            if #[cfg(all(
                target_arch = "aarch64",
                target_feature = "fp16"
            ))] {
                $aarch64
            }

            // Use CPU feature detection if using std
            else if #[cfg(all(
                feature = "std",
                target_arch = "aarch64",
                not(target_feature = "fp16")
            ))] {
                use std::arch::is_aarch64_feature_detected;
                if is_aarch64_feature_detected!("fp16") {
                    $aarch64
                } else {
                    $fallback
                }
            }

            // Fallback to software
            else {
                $fallback
            }
        }
    };
}

#[inline]
pub(crate) fn add_f16(a: u16, b: u16) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            unsafe { aarch64::add_f16_fp16(a, b) }
        } else {
            add_f16_fallback(a, b)
        }
    }
}

#[inline]
pub(crate) fn subtract_f16(a: u16, b: u16) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            unsafe { aarch64::subtract_f16_fp16(a, b) }
        } else {
            subtract_f16_fallback(a, b)
        }
    }
}

#[inline]
pub(crate) fn multiply_f16(a: u16, b: u16) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            unsafe { aarch64::multiply_f16_fp16(a, b) }
        } else {
            multiply_f16_fallback(a, b)
        }
    }
}

#[inline]
pub(crate) fn divide_f16(a: u16, b: u16) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            unsafe { aarch64::divide_f16_fp16(a, b) }
        } else {
            divide_f16_fallback(a, b)
        }
    }
}

#[inline]
pub(crate) fn remainder_f16(a: u16, b: u16) -> u16 {
    remainder_f16_fallback(a, b)
}

#[inline]
pub(crate) fn product_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            // Start from 1.0 (0x3C00 in binary16), the multiplicative identity;
            // folding from 0 would force every product to zero.
            iter.fold(0x3C00, |acc, x| unsafe { aarch64::multiply_f16_fp16(acc, x) })
        } else {
            product_f16_fallback(iter)
        }
    }
}

#[inline]
pub(crate) fn sum_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
    math_fn! {
        if aarch64_feature("fp16") {
            iter.fold(0, |acc, x| unsafe { aarch64::add_f16_fp16(acc, x) })
        } else {
            sum_f16_fallback(iter)
        }
    }
}

/// Chunks sliced into x8 or x4 arrays
#[inline]
fn convert_chunked_slice_8<S: Copy + Default, D: Copy>(
    src: &[S],
    dst: &mut [D],
    fn8: unsafe fn(&[S; 8]) -> [D; 8],
    fn4: unsafe fn(&[S; 4]) -> [D; 4],
) {
    assert_eq!(src.len(), dst.len());

    // TODO: Can be further optimized with array_chunks when it becomes stabilized

    let src_chunks = src.chunks_exact(8);
    let mut dst_chunks = dst.chunks_exact_mut(8);
    let src_remainder = src_chunks.remainder();
    for (s, d) in src_chunks.zip(&mut dst_chunks) {
        let chunk: &[S; 8] = s.try_into().unwrap();
        d.copy_from_slice(unsafe { &fn8(chunk) });
    }

    // Process remainder
    if src_remainder.len() > 4 {
        let mut buf: [S; 8] = Default::default();
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
        let vec = unsafe { fn8(&buf) };
        let dst_remainder = dst_chunks.into_remainder();
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
    } else if !src_remainder.is_empty() {
        let mut buf: [S; 4] = Default::default();
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
        let vec = unsafe { fn4(&buf) };
        let dst_remainder = dst_chunks.into_remainder();
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
    }
}
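// Illustrative walk-through of the remainder handling above (hypothetical call, assuming an
// f16c-capable x86_64 target):
//
//     let src = [0.0f32; 13];
//     let mut dst = [0u16; 13];
//     convert_chunked_slice_8(&src, &mut dst,
//         x86::f32x8_to_f16x8_x86_f16c, x86::f32x4_to_f16x4_x86_f16c);
//
// The loop converts one full chunk of 8, leaving a remainder of 5 elements. Since 5 > 4, the
// remainder is copied into a Default-initialized [S; 8] buffer, converted with `fn8`, and
// only the first 5 results are written back to `dst`. A remainder of 1..=4 would take the
// `fn4` path instead.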

/// Chunks sliced into x4 arrays
#[inline]
fn convert_chunked_slice_4<S: Copy + Default, D: Copy>(
    src: &[S],
    dst: &mut [D],
    f: unsafe fn(&[S; 4]) -> [D; 4],
) {
    assert_eq!(src.len(), dst.len());

    // TODO: Can be further optimized with array_chunks when it becomes stabilized

    let src_chunks = src.chunks_exact(4);
    let mut dst_chunks = dst.chunks_exact_mut(4);
    let src_remainder = src_chunks.remainder();
    for (s, d) in src_chunks.zip(&mut dst_chunks) {
        let chunk: &[S; 4] = s.try_into().unwrap();
        d.copy_from_slice(unsafe { &f(chunk) });
    }

    // Process remainder
    if !src_remainder.is_empty() {
        let mut buf: [S; 4] = Default::default();
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
        let vec = unsafe { f(&buf) };
        let dst_remainder = dst_chunks.into_remainder();
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
    }
}

/////////////// Fallbacks ////////////////

// In the below functions, round to nearest, with ties to even.
// Let us call the most significant bit that will be shifted out the round_bit.
//
// Round up if either
//  a) Removed part > tie.
//     (mantissa & round_bit) != 0 && (mantissa & (round_bit - 1)) != 0
//  b) Removed part == tie, and retained part is odd.
//     (mantissa & round_bit) != 0 && (mantissa & (2 * round_bit)) != 0
// (If removed part == tie and retained part is even, do not round up.)
// These two conditions can be combined into one:
//     (mantissa & round_bit) != 0 && (mantissa & ((round_bit - 1) | (2 * round_bit))) != 0
// which can be simplified into
//     (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0
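//
// Worked example (illustrative) for the f32 -> f16 normal path below, where round_bit is
// 0x1000 and the low 13 mantissa bits are shifted out:
//   mantissa 0x0000_1800: removed part 0x1800 > tie        -> round up (rule a)
//   mantissa 0x0000_1000: exact tie, retained LSB is even  -> keep as is
//   mantissa 0x0000_3000: exact tie, retained LSB is odd   -> round up (rule b)
// In each case the combined test (man & 0x1000) != 0 && (man & 0x2FFF) != 0 agrees with
// applying rules a) and b) directly.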

#[inline]
pub(crate) const fn f32_to_f16_fallback(value: f32) -> u16 {
    // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
    // Convert to raw bytes
    let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };

    // Extract IEEE754 components
    let sign = x & 0x8000_0000u32;
    let exp = x & 0x7F80_0000u32;
    let man = x & 0x007F_FFFFu32;

    // Check for all exponent bits being set, which is Infinity or NaN
    if exp == 0x7F80_0000u32 {
        // Set mantissa MSB for NaN (and also keep shifted mantissa bits)
        let nan_bit = if man == 0 { 0 } else { 0x0200u32 };
        return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 13)) as u16;
    }

    // The number is normalized, start assembling half precision version
    let half_sign = sign >> 16;
    // Unbias the exponent, then bias for half precision
    let unbiased_exp = ((exp >> 23) as i32) - 127;
    let half_exp = unbiased_exp + 15;

    // Check for exponent overflow, return +infinity
    if half_exp >= 0x1F {
        return (half_sign | 0x7C00u32) as u16;
    }

    // Check for underflow
    if half_exp <= 0 {
        // Check mantissa for what we can do
        if 14 - half_exp > 24 {
            // No rounding possibility, so this is a full underflow, return signed zero
            return half_sign as u16;
        }
        // Don't forget about hidden leading mantissa bit when assembling mantissa
        let man = man | 0x0080_0000u32;
        let mut half_man = man >> (14 - half_exp);
        // Check for rounding (see comment above functions)
        let round_bit = 1 << (13 - half_exp);
        if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
            half_man += 1;
        }
        // No exponent for subnormals
        return (half_sign | half_man) as u16;
    }

    // Rebias the exponent
    let half_exp = (half_exp as u32) << 10;
    let half_man = man >> 13;
    // Check for rounding (see comment above functions)
    let round_bit = 0x0000_1000u32;
    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
        // Round it
        ((half_sign | half_exp | half_man) + 1) as u16
    } else {
        (half_sign | half_exp | half_man) as u16
    }
}
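// Illustrative spot checks for the conversion above (values follow from the IEEE 754 binary16
// encoding; shown here as documentation, not as part of the crate's test suite):
//
//     assert_eq!(f32_to_f16_fallback(1.0), 0x3C00);           // exactly representable
//     assert_eq!(f32_to_f16_fallback(65504.0), 0x7BFF);       // largest finite binary16 value
//     assert_eq!(f32_to_f16_fallback(f32::INFINITY), 0x7C00); // +infinity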

#[inline]
pub(crate) const fn f64_to_f16_fallback(value: f64) -> u16 {
    // Convert to raw bytes, truncating the last 32 bits of mantissa; that precision will always
    // be lost on half-precision.
    // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
    let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
    let x = (val >> 32) as u32;

    // Extract IEEE754 components
    let sign = x & 0x8000_0000u32;
    let exp = x & 0x7FF0_0000u32;
    let man = x & 0x000F_FFFFu32;

    // Check for all exponent bits being set, which is Infinity or NaN
    if exp == 0x7FF0_0000u32 {
        // Set mantissa MSB for NaN (and also keep shifted mantissa bits).
        // We also have to check the last 32 bits.
        let nan_bit = if man == 0 && (val as u32 == 0) {
            0
        } else {
            0x0200u32
        };
        return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 10)) as u16;
    }

    // The number is normalized, start assembling half precision version
    let half_sign = sign >> 16;
    // Unbias the exponent, then bias for half precision
    let unbiased_exp = ((exp >> 20) as i64) - 1023;
    let half_exp = unbiased_exp + 15;

    // Check for exponent overflow, return +infinity
    if half_exp >= 0x1F {
        return (half_sign | 0x7C00u32) as u16;
    }

    // Check for underflow
    if half_exp <= 0 {
        // Check mantissa for what we can do
        if 10 - half_exp > 21 {
            // No rounding possibility, so this is a full underflow, return signed zero
            return half_sign as u16;
        }
        // Don't forget about hidden leading mantissa bit when assembling mantissa
        let man = man | 0x0010_0000u32;
        let mut half_man = man >> (11 - half_exp);
        // Check for rounding (see comment above functions)
        let round_bit = 1 << (10 - half_exp);
        if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
            half_man += 1;
        }
        // No exponent for subnormals
        return (half_sign | half_man) as u16;
    }

    // Rebias the exponent
    let half_exp = (half_exp as u32) << 10;
    let half_man = man >> 10;
    // Check for rounding (see comment above functions)
    let round_bit = 0x0000_0200u32;
    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
        // Round it
        ((half_sign | half_exp | half_man) + 1) as u16
    } else {
        (half_sign | half_exp | half_man) as u16
    }
}

#[inline]
pub(crate) const fn f16_to_f32_fallback(i: u16) -> f32 {
    // Check for signed zero
    // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
    if i & 0x7FFFu16 == 0 {
        return unsafe { mem::transmute::<u32, f32>((i as u32) << 16) };
    }

    let half_sign = (i & 0x8000u16) as u32;
    let half_exp = (i & 0x7C00u16) as u32;
    let half_man = (i & 0x03FFu16) as u32;

    // Check for an infinity or NaN when all exponent bits set
    if half_exp == 0x7C00u32 {
        // Check for signed infinity if mantissa is zero
        if half_man == 0 {
            return unsafe { mem::transmute::<u32, f32>((half_sign << 16) | 0x7F80_0000u32) };
        } else {
            // NaN, keep current mantissa but also set most significant mantissa bit
            return unsafe {
                mem::transmute::<u32, f32>((half_sign << 16) | 0x7FC0_0000u32 | (half_man << 13))
            };
        }
    }

    // Calculate single-precision components with adjusted exponent
    let sign = half_sign << 16;
    // Unbias exponent
    let unbiased_exp = ((half_exp as i32) >> 10) - 15;

    // Check for subnormals, which will be normalized by adjusting exponent
    if half_exp == 0 {
        // Calculate how much to adjust the exponent by
        let e = leading_zeros_u16(half_man as u16) - 6;

        // Rebias and adjust exponent
        let exp = (127 - 15 - e) << 23;
        let man = (half_man << (14 + e)) & 0x7F_FF_FFu32;
        return unsafe { mem::transmute::<u32, f32>(sign | exp | man) };
    }

    // Rebias exponent for a normalized normal
    let exp = ((unbiased_exp + 127) as u32) << 23;
    let man = (half_man & 0x03FFu32) << 13;
    unsafe { mem::transmute::<u32, f32>(sign | exp | man) }
}
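// Illustrative subnormal case for the normalization above: for i = 0x0001 (the smallest
// positive binary16 subnormal, 2^-24), half_man is 1, so e = leading_zeros_u16(1) - 6 = 9.
// The f32 exponent field becomes 127 - 15 - 9 = 103 (i.e. 2^-24), and the shifted mantissa
// 1 << 23 has its hidden bit masked off, giving exactly 2^-24 as a normal f32.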

#[inline]
pub(crate) const fn f16_to_f64_fallback(i: u16) -> f64 {
    // Check for signed zero
    // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
    if i & 0x7FFFu16 == 0 {
        return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
    }

    let half_sign = (i & 0x8000u16) as u64;
    let half_exp = (i & 0x7C00u16) as u64;
    let half_man = (i & 0x03FFu16) as u64;

    // Check for an infinity or NaN when all exponent bits set
    if half_exp == 0x7C00u64 {
        // Check for signed infinity if mantissa is zero
        if half_man == 0 {
            return unsafe {
                mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
            };
        } else {
            // NaN, keep current mantissa but also set most significant mantissa bit
            return unsafe {
                mem::transmute::<u64, f64>(
                    (half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 42),
                )
            };
        }
    }

    // Calculate double-precision components with adjusted exponent
    let sign = half_sign << 48;
    // Unbias exponent
    let unbiased_exp = ((half_exp as i64) >> 10) - 15;

    // Check for subnormals, which will be normalized by adjusting exponent
    if half_exp == 0 {
        // Calculate how much to adjust the exponent by
        let e = leading_zeros_u16(half_man as u16) - 6;

        // Rebias and adjust exponent
        let exp = ((1023 - 15 - e) as u64) << 52;
        let man = (half_man << (43 + e)) & 0xF_FFFF_FFFF_FFFFu64;
        return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
    }

    // Rebias exponent for a normalized normal
    let exp = ((unbiased_exp + 1023) as u64) << 52;
    let man = (half_man & 0x03FFu64) << 42;
    unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
}

#[inline]
fn f16x4_to_f32x4_fallback(v: &[u16; 4]) -> [f32; 4] {
    [
        f16_to_f32_fallback(v[0]),
        f16_to_f32_fallback(v[1]),
        f16_to_f32_fallback(v[2]),
        f16_to_f32_fallback(v[3]),
    ]
}

#[inline]
fn f32x4_to_f16x4_fallback(v: &[f32; 4]) -> [u16; 4] {
    [
        f32_to_f16_fallback(v[0]),
        f32_to_f16_fallback(v[1]),
        f32_to_f16_fallback(v[2]),
        f32_to_f16_fallback(v[3]),
    ]
}

#[inline]
fn f16x4_to_f64x4_fallback(v: &[u16; 4]) -> [f64; 4] {
    [
        f16_to_f64_fallback(v[0]),
        f16_to_f64_fallback(v[1]),
        f16_to_f64_fallback(v[2]),
        f16_to_f64_fallback(v[3]),
    ]
}

#[inline]
fn f64x4_to_f16x4_fallback(v: &[f64; 4]) -> [u16; 4] {
    [
        f64_to_f16_fallback(v[0]),
        f64_to_f16_fallback(v[1]),
        f64_to_f16_fallback(v[2]),
        f64_to_f16_fallback(v[3]),
    ]
}

#[inline]
fn f16x8_to_f32x8_fallback(v: &[u16; 8]) -> [f32; 8] {
    [
        f16_to_f32_fallback(v[0]),
        f16_to_f32_fallback(v[1]),
        f16_to_f32_fallback(v[2]),
        f16_to_f32_fallback(v[3]),
        f16_to_f32_fallback(v[4]),
        f16_to_f32_fallback(v[5]),
        f16_to_f32_fallback(v[6]),
        f16_to_f32_fallback(v[7]),
    ]
}

#[inline]
fn f32x8_to_f16x8_fallback(v: &[f32; 8]) -> [u16; 8] {
    [
        f32_to_f16_fallback(v[0]),
        f32_to_f16_fallback(v[1]),
        f32_to_f16_fallback(v[2]),
        f32_to_f16_fallback(v[3]),
        f32_to_f16_fallback(v[4]),
        f32_to_f16_fallback(v[5]),
        f32_to_f16_fallback(v[6]),
        f32_to_f16_fallback(v[7]),
    ]
}

#[inline]
fn f16x8_to_f64x8_fallback(v: &[u16; 8]) -> [f64; 8] {
    [
        f16_to_f64_fallback(v[0]),
        f16_to_f64_fallback(v[1]),
        f16_to_f64_fallback(v[2]),
        f16_to_f64_fallback(v[3]),
        f16_to_f64_fallback(v[4]),
        f16_to_f64_fallback(v[5]),
        f16_to_f64_fallback(v[6]),
        f16_to_f64_fallback(v[7]),
    ]
}

#[inline]
fn f64x8_to_f16x8_fallback(v: &[f64; 8]) -> [u16; 8] {
    [
        f64_to_f16_fallback(v[0]),
        f64_to_f16_fallback(v[1]),
        f64_to_f16_fallback(v[2]),
        f64_to_f16_fallback(v[3]),
        f64_to_f16_fallback(v[4]),
        f64_to_f16_fallback(v[5]),
        f64_to_f16_fallback(v[6]),
        f64_to_f16_fallback(v[7]),
    ]
}

#[inline]
fn slice_fallback<S: Copy, D>(src: &[S], dst: &mut [D], f: fn(S) -> D) {
    assert_eq!(src.len(), dst.len());
    for (s, d) in src.iter().copied().zip(dst.iter_mut()) {
        *d = f(s);
    }
}

#[inline]
fn add_f16_fallback(a: u16, b: u16) -> u16 {
    f32_to_f16(f16_to_f32(a) + f16_to_f32(b))
}

#[inline]
fn subtract_f16_fallback(a: u16, b: u16) -> u16 {
    f32_to_f16(f16_to_f32(a) - f16_to_f32(b))
}

#[inline]
fn multiply_f16_fallback(a: u16, b: u16) -> u16 {
    f32_to_f16(f16_to_f32(a) * f16_to_f32(b))
}

#[inline]
fn divide_f16_fallback(a: u16, b: u16) -> u16 {
    f32_to_f16(f16_to_f32(a) / f16_to_f32(b))
}

#[inline]
fn remainder_f16_fallback(a: u16, b: u16) -> u16 {
    f32_to_f16(f16_to_f32(a) % f16_to_f32(b))
}

#[inline]
fn product_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
    f32_to_f16(iter.map(f16_to_f32).product())
}

#[inline]
fn sum_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
    f32_to_f16(iter.map(f16_to_f32).sum())
}

// TODO SIMD arithmetic