wide/f32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { a : f32x4, b : f32x4 }
  }
}
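
// Whichever branch is taken, the type is 32 bytes with 32-byte alignment, so
// `bytemuck` casts between `f32x8`, `[f32; 8]`, and the raw register form are
// sound. An illustrative compile-time sanity check of that invariant:
const _: [(); 32] = [(); core::mem::size_of::<f32x8>()];
const _: [(); 32] = [(); core::mem::align_of::<f32x8>()];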

macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    pub const $i: f32x8 =
      unsafe { ConstUnionHack256bit { f32a8: [$f; 8] }.f32x8 };
  };
}
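
// These constants have to be built in a `const` context, where `splat` isn't
// available, so the bytes of an `[f32; 8]` are reinterpreted through the
// crate's `ConstUnionHack256bit` union; the `unsafe` is sound because every
// field of that union is plain-old-data of the same size.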

impl f32x8 {
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Add<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).add(rhs)
  }
}

impl Sub<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).sub(rhs)
  }
}

impl Mul<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).mul(rhs)
  }
}

impl Div<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).div(rhs)
  }
}

impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ge(rhs.a),
          b : self.b.cmp_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ne(rhs.a),
          b : self.b.cmp_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_le(rhs.a),
          b : self.b.cmp_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}

impl f32x8 {
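  /// Builds a vector from an array, lane 0 first.
  ///
  /// A minimal sketch of the arithmetic API this file implements, including
  /// the mixed vector/scalar operator impls above (illustrative, not
  /// exhaustive):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let v = f32x8::new([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// // vector op scalar and scalar op vector both splat the scalar
  /// let w = (v * 2.0 + 1.0) / 3.0;
  /// assert_eq!(
  ///   w.to_array(),
  ///   [1.0, 5.0 / 3.0, 7.0 / 3.0, 3.0, 11.0 / 3.0, 13.0 / 3.0, 5.0, 17.0 / 3.0]
  /// );
  /// ```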
  #[inline]
  #[must_use]
  pub fn new(array: [f32; 8]) -> Self {
    Self::from(array)
  }
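  /// Lanewise select: where a lane of `self` (used as a mask) is all ones,
  /// the lane of `t` is taken, otherwise the lane of `f`.
  ///
  /// A small usage sketch, assuming the comparison traits are re-exported
  /// from the crate root (illustrative):
  ///
  /// ```rust
  /// use wide::{f32x8, CmpGt};
  /// let x = f32x8::from([1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0]);
  /// let mask = x.cmp_gt(f32x8::splat(0.0));
  /// // keep the positive lanes, zero out the rest
  /// let clamped = mask.blend(x, f32x8::splat(0.0));
  /// assert_eq!(clamped.to_array(), [1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0]);
  /// ```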
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This doesn't match
  /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`.
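  ///
  /// A numeric sketch of the NaN handling (illustrative):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// let b = f32x8::from([4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, f32::NAN]);
  /// // a NaN lane in `rhs` is replaced by the corresponding `self` lane
  /// let m = a.max(b);
  /// assert_eq!(m.to_array(), [4.5, 4.5, 4.5, 4.5, 5.0, 6.0, 7.0, 8.0]);
  /// ```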
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }
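  /// Lanewise "is finite" test, returning an all-ones mask for finite lanes.
  ///
  /// Shifting the bits left by one drops the sign and parks the 8 exponent
  /// bits at the top of the word; a lane is non-finite exactly when that
  /// exponent field is all ones. An illustrative check (`move_mask` packs the
  /// mask into bits, lane 0 lowest):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let x = f32x8::from([1.0, f32::NAN, f32::INFINITY, f32::NEG_INFINITY, 0.0, -2.5, 1e38, -0.0]);
  /// assert_eq!(x.is_finite().move_mask(), 0b11110001);
  /// ```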
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? The f32x4 version probably
      // translates, but I've not had time to figure it out.
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out-of-range values or NaNs. For those
  /// values you get implementation-defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_round_int(),
          self.b.fast_round_int()])
      }
    }
  }

  /// Rounds each lane into an integer. This saturates out-of-range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out-of-range values or NaNs.
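  ///
  /// A numeric sketch of the edge cases (illustrative; rounding is
  /// round-half-to-even):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let x = f32x8::from([1.5, 2.5, -0.6, f32::NAN, 3.4e9, -3.4e9, 0.4, 8.0]);
  /// assert_eq!(
  ///   x.round_int().to_array(),
  ///   [2, 2, -1, 0, i32::MAX, i32::MIN, 0, 8]
  /// );
  /// ```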
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.round_int(),
          self.b.round_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out-of-range values or NaNs. For those
  /// values you get implementation-defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_trunc_int(),
          self.b.fast_trunc_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This saturates out-of-range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out-of-range values or NaNs.
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.trunc_int(),
          self.b.trunc_int(),
        ])
      }
    }
  }
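  /// Lanewise fused multiply-add: `(self * m) + a` computed with a single
  /// rounding when an FMA unit is available, and as an ordinary
  /// multiply-then-add otherwise; the two paths can differ by one ULP for
  /// some inputs.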
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - a
      } else {
        Self {
          a : self.a.mul_sub(m.a, a.a),
          b : self.b.mul_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - a
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, a.a),
          b : self.b.mul_neg_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[allow(non_upper_case_globals)]
  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[allow(non_upper_case_globals)]
  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

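  /// Lanewise four-quadrant arctangent of `self` (y) and `x`.
  ///
  /// A numeric sketch (illustrative; the polynomial approximation is accurate
  /// to a few ULP, so compare with a tolerance):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let y = f32x8::splat(1.0);
  /// let x = f32x8::splat(1.0);
  /// let a = y.atan2(x).to_array();
  /// assert!((a[0] - core::f32::consts::FRAC_PI_4).abs() < 1e-6);
  /// ```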
  #[allow(non_upper_case_globals)]
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

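    // Cody-Waite style argument reduction: pi/2 is split into DP1F + DP2F +
    // DP3F so the multiple of pi/2 can be subtracted from |x| in pieces
    // without catastrophic rounding error.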
    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0));

    let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip(),
          b : self.b.recip(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip_sqrt(),
          b : self.b.recip_sqrt(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
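  /// Packs the sign bit of each lane into the low 8 bits of an `i32`, lane 0
  /// in bit 0. For comparison masks this turns the vector result into a
  /// scalar bitset, as in the `is_finite` example above.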
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx)
      } else {
        (self.b.move_mask() << 4) | self.a.move_mask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) == 0b11111111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

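  // Builds 2^n for integral-valued lanes by bit manipulation: adding
  // bias + 2^23 leaves n sitting in the low mantissa bits with a known
  // exponent, and shifting left by 23 moves n into the exponent field, which
  // is exactly the bit pattern of 2^n.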
  #[inline]
  #[allow(non_upper_case_globals)]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }

  /// Calculates the lanewise exponential function `e^x`.
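  ///
  /// A small numeric sanity check (illustrative):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let x = f32x8::from([0.0, 1.0, -1.0, 0.5, 2.0, -2.0, 10.0, -10.0]);
  /// let e = x.exp().to_array();
  /// assert_eq!(e[0], 1.0);
  /// assert!((e[1] - core::f32::consts::E).abs() < 1e-6);
  /// ```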
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn exp(self) -> Self {
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1.0 / 24.0);
    const_f32_as_f32x8!(P3, 1.0 / 120.0);
    const_f32_as_f32x8!(P4, 1.0 / 720.0);
    const_f32_as_f32x8!(P5, 1.0 / 5040.0);
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

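  // Extracts the (unbiased) exponent of each lane as a float, for the
  // non-negative inputs this is used on: the shift isolates the exponent
  // bits, OR-ing in the bits of 2^23 produces the float 2^23 + e, and
  // subtracting (2^23 + bias) leaves e - bias.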
  #[inline]
  #[allow(non_upper_case_globals)]
  fn exponent(self) -> f32x8 {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = cast::<_, u32x8>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x8>(pow2_23);
    let d = cast::<_, f32x8>(c);
    let e = d - (pow2_23 + bias);
    e
  }

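  // Keeps each lane's mantissa but forces the exponent to that of 0.5,
  // mapping any finite nonzero input into [0.5, 1.0); `ln` and `pow_f32x8`
  // run their polynomials on this reduced range.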
  #[inline]
  #[allow(non_upper_case_globals)]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x8>(self);
    let t2 = cast::<_, u32x8>(
      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
    );
    cast::<_, f32x8>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x8>(self);
    let t = t & i32x8::splat(0x7F800000);
    i32x8::round_float(t.cmp_eq(i32x8::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7F800000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x8>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO)
  }

  /// Horizontal add of all the lanes of the vector.
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")]{
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad,hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad,sum_quad);
        let sum_dual = add_m128(lo_dual,hi_dual);
        let lo = sum_dual;
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (ln(x)), lanewise.
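  ///
  /// A small numeric sanity check (illustrative):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let x = f32x8::from([1.0, core::f32::consts::E, 10.0, 0.5, 2.0, 4.0, 100.0, 3.0]);
  /// let l = x.ln().to_array();
  /// assert_eq!(l[0], 0.0);
  /// assert!((l[1] - 1.0).abs() < 1e-6);
  /// ```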
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

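  /// Lanewise `self` raised to the power `y` (`self^y`), essentially the
  /// exponential of `y * ln(self)` with extended-precision corrections and
  /// careful handling of signs, zeros, and infinities.
  ///
  /// A numeric sketch (illustrative):
  ///
  /// ```rust
  /// use wide::f32x8;
  /// let b = f32x8::splat(2.0);
  /// let p = b.powf(10.0).to_array();
  /// assert!((p[0] - 1024.0).abs() < 1e-3);
  /// ```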
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn pow_f32x8(self, y: Self) -> Self {
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p2expf, 1.0 / 2.0);
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF)))
      | (ee.cmp_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000)))
      | (ee.cmp_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.cmp_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x8(self, f32x8::splat(y))
  }

  /// Transposes an 8x8 matrix of `f32`s. Currently only accelerated on AVX.
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx")] {
        let a0 = unpack_lo_m256(data[0].avx, data[1].avx);
        let a1 = unpack_hi_m256(data[0].avx, data[1].avx);
        let a2 = unpack_lo_m256(data[2].avx, data[3].avx);
        let a3 = unpack_hi_m256(data[2].avx, data[3].avx);
        let a4 = unpack_lo_m256(data[4].avx, data[5].avx);
        let a5 = unpack_hi_m256(data[4].avx, data[5].avx);
        let a6 = unpack_lo_m256(data[6].avx, data[7].avx);
        let a7 = unpack_hi_m256(data[6].avx, data[7].avx);
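
        // The unpacks above interleave row pairs within each 128-bit lane;
        // the shuffles below gather 4-wide pieces of each output row, and the
        // final permute2z stage swaps the 128-bit halves into place.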

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: the Intel performance manual suggests an alternative
        // using blend to avoid port 5 pressure (blend runs on a different
        // port than shuffle)
        let b0 = shuffle_m256::<SHUFF_LO>(a0,a2);
        let b1 = shuffle_m256::<SHUFF_HI>(a0,a2);
        let b2 = shuffle_m256::<SHUFF_LO>(a1,a3);
        let b3 = shuffle_m256::<SHUFF_HI>(a1,a3);
        let b4 = shuffle_m256::<SHUFF_LO>(a4,a6);
        let b5 = shuffle_m256::<SHUFF_HI>(a4,a6);
        let b6 = shuffle_m256::<SHUFF_LO>(a5,a7);
        let b7 = shuffle_m256::<SHUFF_HI>(a5,a7);

        [
          f32x8 { avx: permute2z_m256::<0x20>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x20>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x20>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x20>(b3, b7) },
          f32x8 { avx: permute2z_m256::<0x31>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x31>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x31>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 {
          f32x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [f32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f32; 8] {
    cast_mut(self)
  }
}

impl Not for f32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}