wide/f32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(16))]
7    pub struct f32x4 { sse: m128 }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct f32x4 { simd: v128 }
14
15    impl Default for f32x4 {
16      fn default() -> Self {
17        Self::splat(0.0)
18      }
19    }
20
21    impl PartialEq for f32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(f32x4_eq(self.simd, other.simd))
24      }
25    }
26  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27    use core::arch::aarch64::*;
28    #[repr(C)]
29    #[derive(Copy, Clone)]
30    pub struct f32x4 { neon : float32x4_t }
31
32    impl Default for f32x4 {
33      #[inline]
34      #[must_use]
35      fn default() -> Self {
36        unsafe { Self { neon: vdupq_n_f32(0.0)} }
37      }
38    }
39
40    impl PartialEq for f32x4 {
41      #[inline]
42      #[must_use]
43      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
      }
    }
  } else {
49    #[derive(Default, Clone, Copy, PartialEq)]
50    #[repr(C, align(16))]
51    pub struct f32x4 { arr: [f32;4] }
52  }
53}
54
55macro_rules! const_f32_as_f32x4 {
56  ($i:ident, $f:expr) => {
57    pub const $i: f32x4 =
58      unsafe { ConstUnionHack128bit { f32a4: [$f; 4] }.f32x4 };
59  };
60}
61
62impl f32x4 {
63  const_f32_as_f32x4!(ONE, 1.0);
64  const_f32_as_f32x4!(ZERO, 0.0);
65  const_f32_as_f32x4!(HALF, 0.5);
66  const_f32_as_f32x4!(E, core::f32::consts::E);
67  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
68  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
69  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
70  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
71  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
72  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
73  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
74  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
75  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
76  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
77  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
78  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
79  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
80  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
81  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
82  const_f32_as_f32x4!(PI, core::f32::consts::PI);
83  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
84  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
85}
86
87unsafe impl Zeroable for f32x4 {}
88unsafe impl Pod for f32x4 {}
89
90impl Add for f32x4 {
91  type Output = Self;
92  #[inline]
93  #[must_use]
94  fn add(self, rhs: Self) -> Self::Output {
95    pick! {
96      if #[cfg(target_feature="sse")] {
97        Self { sse: add_m128(self.sse, rhs.sse) }
98      } else if #[cfg(target_feature="simd128")] {
99        Self { simd: f32x4_add(self.simd, rhs.simd) }
100      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
101        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
102      } else {
103        Self { arr: [
104          self.arr[0] + rhs.arr[0],
105          self.arr[1] + rhs.arr[1],
106          self.arr[2] + rhs.arr[2],
107          self.arr[3] + rhs.arr[3],
108        ]}
109      }
110    }
111  }
112}
113
114impl Sub for f32x4 {
115  type Output = Self;
116  #[inline]
117  #[must_use]
118  fn sub(self, rhs: Self) -> Self::Output {
119    pick! {
120      if #[cfg(target_feature="sse")] {
121        Self { sse: sub_m128(self.sse, rhs.sse) }
122      } else if #[cfg(target_feature="simd128")] {
123        Self { simd: f32x4_sub(self.simd, rhs.simd) }
124      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126      } else {
127        Self { arr: [
128          self.arr[0] - rhs.arr[0],
129          self.arr[1] - rhs.arr[1],
130          self.arr[2] - rhs.arr[2],
131          self.arr[3] - rhs.arr[3],
132        ]}
133      }
134    }
135  }
136}
137
138impl Mul for f32x4 {
139  type Output = Self;
140  #[inline]
141  #[must_use]
142  fn mul(self, rhs: Self) -> Self::Output {
143    pick! {
144      if #[cfg(target_feature="sse")] {
145        Self { sse: mul_m128(self.sse, rhs.sse) }
146      } else if #[cfg(target_feature="simd128")] {
147        Self { simd: f32x4_mul(self.simd, rhs.simd) }
148      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
149        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
150      } else {
151        Self { arr: [
152          self.arr[0] * rhs.arr[0],
153          self.arr[1] * rhs.arr[1],
154          self.arr[2] * rhs.arr[2],
155          self.arr[3] * rhs.arr[3],
156        ]}
157      }
158    }
159  }
160}
161
162impl Div for f32x4 {
163  type Output = Self;
164  #[inline]
165  #[must_use]
166  fn div(self, rhs: Self) -> Self::Output {
167    pick! {
168      if #[cfg(target_feature="sse")] {
169        Self { sse: div_m128(self.sse, rhs.sse) }
170      } else if #[cfg(target_feature="simd128")] {
171        Self { simd: f32x4_div(self.simd, rhs.simd) }
172      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
173        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
174      } else {
175        Self { arr: [
176          self.arr[0] / rhs.arr[0],
177          self.arr[1] / rhs.arr[1],
178          self.arr[2] / rhs.arr[2],
179          self.arr[3] / rhs.arr[3],
180        ]}
181      }
182    }
183  }
184}
185
186impl Add<f32> for f32x4 {
187  type Output = Self;
188  #[inline]
189  #[must_use]
190  fn add(self, rhs: f32) -> Self::Output {
191    self.add(Self::splat(rhs))
192  }
193}
194
195impl Sub<f32> for f32x4 {
196  type Output = Self;
197  #[inline]
198  #[must_use]
199  fn sub(self, rhs: f32) -> Self::Output {
200    self.sub(Self::splat(rhs))
201  }
202}
203
204impl Mul<f32> for f32x4 {
205  type Output = Self;
206  #[inline]
207  #[must_use]
208  fn mul(self, rhs: f32) -> Self::Output {
209    self.mul(Self::splat(rhs))
210  }
211}
212
213impl Div<f32> for f32x4 {
214  type Output = Self;
215  #[inline]
216  #[must_use]
217  fn div(self, rhs: f32) -> Self::Output {
218    self.div(Self::splat(rhs))
219  }
220}
221
222impl Add<f32x4> for f32 {
223  type Output = f32x4;
224  #[inline]
225  #[must_use]
226  fn add(self, rhs: f32x4) -> Self::Output {
227    f32x4::splat(self).add(rhs)
228  }
229}
230
231impl Sub<f32x4> for f32 {
232  type Output = f32x4;
233  #[inline]
234  #[must_use]
235  fn sub(self, rhs: f32x4) -> Self::Output {
236    f32x4::splat(self).sub(rhs)
237  }
238}
239
240impl Mul<f32x4> for f32 {
241  type Output = f32x4;
242  #[inline]
243  #[must_use]
244  fn mul(self, rhs: f32x4) -> Self::Output {
245    f32x4::splat(self).mul(rhs)
246  }
247}
248
249impl Div<f32x4> for f32 {
250  type Output = f32x4;
251  #[inline]
252  #[must_use]
253  fn div(self, rhs: f32x4) -> Self::Output {
254    f32x4::splat(self).div(rhs)
255  }
256}
257
258impl BitAnd for f32x4 {
259  type Output = Self;
260  #[inline]
261  #[must_use]
262  fn bitand(self, rhs: Self) -> Self::Output {
263    pick! {
264      if #[cfg(target_feature="sse")] {
265        Self { sse: bitand_m128(self.sse, rhs.sse) }
266      } else if #[cfg(target_feature="simd128")] {
267        Self { simd: v128_and(self.simd, rhs.simd) }
268      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
269        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
270      } else {
271        Self { arr: [
272          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
273          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
274          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
275          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
276        ]}
277      }
278    }
279  }
280}
281
282impl BitOr for f32x4 {
283  type Output = Self;
284  #[inline]
285  #[must_use]
286  fn bitor(self, rhs: Self) -> Self::Output {
287    pick! {
288      if #[cfg(target_feature="sse")] {
289        Self { sse: bitor_m128(self.sse, rhs.sse) }
290      } else if #[cfg(target_feature="simd128")] {
291        Self { simd: v128_or(self.simd, rhs.simd) }
292      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
293        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
294      } else {
295        Self { arr: [
296          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
297          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
298          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
299          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
300        ]}
301      }
302    }
303  }
304}
305
306impl BitXor for f32x4 {
307  type Output = Self;
308  #[inline]
309  #[must_use]
310  fn bitxor(self, rhs: Self) -> Self::Output {
311    pick! {
312      if #[cfg(target_feature="sse")] {
313        Self { sse: bitxor_m128(self.sse, rhs.sse) }
314      } else if #[cfg(target_feature="simd128")] {
315        Self { simd: v128_xor(self.simd, rhs.simd) }
316      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
317        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
318      } else {
319        Self { arr: [
320          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
321          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
322          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
323          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
324        ]}
325      }
326    }
327  }
328}
329
330impl CmpEq for f32x4 {
331  type Output = Self;
332  #[inline]
333  #[must_use]
334  fn cmp_eq(self, rhs: Self) -> Self::Output {
335    pick! {
336      if #[cfg(target_feature="sse")] {
337        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
338      } else if #[cfg(target_feature="simd128")] {
339        Self { simd: f32x4_eq(self.simd, rhs.simd) }
340      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
341        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
342      } else {
343        Self { arr: [
344          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
345          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
346          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
347          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
348        ]}
349      }
350    }
351  }
352}
353
354impl CmpGe for f32x4 {
355  type Output = Self;
356  #[inline]
357  #[must_use]
358  fn cmp_ge(self, rhs: Self) -> Self::Output {
359    pick! {
360      if #[cfg(target_feature="sse")] {
361        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
362      } else if #[cfg(target_feature="simd128")] {
363        Self { simd: f32x4_ge(self.simd, rhs.simd) }
364      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
365        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
366      } else {
367        Self { arr: [
368          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
369          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
370          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
371          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
372        ]}
373      }
374    }
375  }
376}
377
378impl CmpGt for f32x4 {
379  type Output = Self;
380  #[inline]
381  #[must_use]
382  fn cmp_gt(self, rhs: Self) -> Self::Output {
383    pick! {
384      if #[cfg(target_feature="sse")] {
385        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
386      } else if #[cfg(target_feature="simd128")] {
387        Self { simd: f32x4_gt(self.simd, rhs.simd) }
388      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
389        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
390      } else {
391        Self { arr: [
392          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
393          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
394          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
395          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
396        ]}
397      }
398    }
399  }
400}
401
402impl CmpNe for f32x4 {
403  type Output = Self;
404  #[inline]
405  #[must_use]
406  fn cmp_ne(self, rhs: Self) -> Self::Output {
407    pick! {
408      if #[cfg(target_feature="sse")] {
409        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
410      } else if #[cfg(target_feature="simd128")] {
411        Self { simd: f32x4_ne(self.simd, rhs.simd) }
412      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
413        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
414      } else {
415        Self { arr: [
416          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
417          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
418          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
419          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
420        ]}
421      }
422    }
423  }
424}
425
426impl CmpLe for f32x4 {
427  type Output = Self;
428  #[inline]
429  #[must_use]
430  fn cmp_le(self, rhs: Self) -> Self::Output {
431    pick! {
432      if #[cfg(target_feature="sse")] {
433        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
434      } else if #[cfg(target_feature="simd128")] {
435        Self { simd: f32x4_le(self.simd, rhs.simd) }
436      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
437        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
438      } else {
439        Self { arr: [
440          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
441          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
442          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
443          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
444        ]}
445      }
446    }
447  }
448}
449
450impl CmpLt for f32x4 {
451  type Output = Self;
452  #[inline]
453  #[must_use]
454  fn cmp_lt(self, rhs: Self) -> Self::Output {
455    pick! {
456      if #[cfg(target_feature="sse")] {
457        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
458      } else if #[cfg(target_feature="simd128")] {
459        Self { simd: f32x4_lt(self.simd, rhs.simd) }
460      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
461        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
462      } else {
463        Self { arr: [
464          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
465          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
466          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
467          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
468        ]}
469      }
470    }
471  }
472}
473
474impl f32x4 {
475  #[inline]
476  #[must_use]
477  pub fn new(array: [f32; 4]) -> Self {
478    Self::from(array)
479  }
480
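  /// Lanewise select using `self` as a mask: lanes whose mask bits are all 1s
  /// (e.g. the output of a comparison) take their value from `t`, and lanes
  /// whose mask bits are all 0s take their value from `f`. A small usage
  /// sketch, with values picked just for illustration:
  ///
  /// ```
  /// # use wide::*;
  /// let mask = f32x4::from([1.0, 5.0, 3.0, 7.0]).cmp_gt(f32x4::splat(2.0));
  /// let picked = mask.blend(f32x4::from([10.0, 20.0, 30.0, 40.0]), f32x4::ZERO);
  /// assert_eq!(picked.to_array(), [0.0, 20.0, 30.0, 40.0]);
  /// ```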
481  #[inline]
482  #[must_use]
483  pub fn blend(self, t: Self, f: Self) -> Self {
484    pick! {
485      if #[cfg(target_feature="sse4.1")] {
486        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
487      } else if #[cfg(target_feature="simd128")] {
488        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
489      } else {
490        generic_bit_blend(self, t, f)
491      }
492    }
493  }
494  #[inline]
495  #[must_use]
496  pub fn abs(self) -> Self {
497    pick! {
498      if #[cfg(target_feature="simd128")] {
499        Self { simd: f32x4_abs(self.simd) }
500      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
501        unsafe {Self { neon: vabsq_f32(self.neon) }}
502      } else {
503        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
504        self & non_sign_bits
505      }
506    }
507  }
508
509  /// Calculates the lanewise maximum of both vectors. This is a faster
510  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
511  /// involved.
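  ///
  /// A small sketch with NaN-free inputs, where the result is well defined:
  ///
  /// ```
  /// # use wide::*;
  /// let a = f32x4::from([1.0, 7.0, -2.0, 3.0]);
  /// let b = f32x4::from([5.0, 2.0, -8.0, 3.5]);
  /// assert_eq!(a.fast_max(b).to_array(), [5.0, 7.0, -2.0, 3.5]);
  /// ```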
512  #[inline]
513  #[must_use]
514  pub fn fast_max(self, rhs: Self) -> Self {
515    pick! {
516      if #[cfg(target_feature="sse")] {
517        Self { sse: max_m128(self.sse, rhs.sse) }
518      } else if #[cfg(target_feature="simd128")] {
519        Self {
520          simd: f32x4_pmax(self.simd, rhs.simd),
521        }
522      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
523        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
524      } else {
525        Self { arr: [
526          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
527          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
528          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
529          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
530        ]}
531      }
532    }
533  }
534
535  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
536  /// the other lane gets chosen. Use `fast_max` for a faster implementation
537  /// that doesn't handle NaNs.
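  ///
  /// A rough sketch of the NaN handling described above:
  ///
  /// ```
  /// # use wide::*;
  /// let a = f32x4::from([f32::NAN, 1.0, 2.0, 3.0]);
  /// let b = f32x4::from([4.0, f32::NAN, 1.0, 5.0]);
  /// assert_eq!(a.max(b).to_array(), [4.0, 1.0, 2.0, 5.0]);
  /// ```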
538  #[inline]
539  #[must_use]
540  pub fn max(self, rhs: Self) -> Self {
541    pick! {
542      if #[cfg(target_feature="sse")] {
        // max_m128 seems to do `rhs < self ? self : rhs`, so when any NaN is
        // involved the comparison is false and rhs gets chosen. That means we
        // need to specifically check rhs for NaN and keep self in that case.
546        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
547      } else if #[cfg(target_feature="simd128")] {
548        // WASM has two max intrinsics:
549        // - max: This propagates NaN, that's the opposite of what we need.
550        // - pmax: This is defined as self < rhs ? rhs : self, which basically
551        //   chooses self if either is NaN.
552        //
553        // pmax is what we want, but we need to specifically check self for NaN.
554        Self {
555          simd: v128_bitselect(
556            rhs.simd,
557            f32x4_pmax(self.simd, rhs.simd),
558            f32x4_ne(self.simd, self.simd), // NaN check
559          )
560        }
561      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
562        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
563      } else {
564        Self { arr: [
565          self.arr[0].max(rhs.arr[0]),
566          self.arr[1].max(rhs.arr[1]),
567          self.arr[2].max(rhs.arr[2]),
568          self.arr[3].max(rhs.arr[3]),
569        ]}
570      }
571    }
572  }
573
574  /// Calculates the lanewise minimum of both vectors. This is a faster
575  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
576  /// involved.
577  #[inline]
578  #[must_use]
579  pub fn fast_min(self, rhs: Self) -> Self {
580    pick! {
581      if #[cfg(target_feature="sse")] {
582        Self { sse: min_m128(self.sse, rhs.sse) }
583      } else if #[cfg(target_feature="simd128")] {
584        Self {
585          simd: f32x4_pmin(self.simd, rhs.simd),
586        }
587      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
588        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
589      } else {
590        Self { arr: [
591          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
592          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
593          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
594          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
595        ]}
596      }
597    }
598  }
599
600  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
601  /// the other lane gets chosen. Use `fast_min` for a faster implementation
602  /// that doesn't handle NaNs.
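  ///
  /// A rough sketch of the NaN handling described above:
  ///
  /// ```
  /// # use wide::*;
  /// let a = f32x4::from([1.0, f32::NAN, 3.0, -4.0]);
  /// let b = f32x4::from([f32::NAN, 2.0, 1.0, 0.5]);
  /// assert_eq!(a.min(b).to_array(), [1.0, 2.0, 1.0, -4.0]);
  /// ```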
603  #[inline]
604  #[must_use]
605  pub fn min(self, rhs: Self) -> Self {
606    pick! {
607      if #[cfg(target_feature="sse")] {
        // min_m128 seems to do `self < rhs ? self : rhs`, so when any NaN is
        // involved the comparison is false and rhs gets chosen. That means we
        // need to specifically check rhs for NaN and keep self in that case.
611        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
612      } else if #[cfg(target_feature="simd128")] {
613        // WASM has two min intrinsics:
614        // - min: This propagates NaN, that's the opposite of what we need.
615        // - pmin: This is defined as rhs < self ? rhs : self, which basically
616        //   chooses self if either is NaN.
617        //
618        // pmin is what we want, but we need to specifically check self for NaN.
619        Self {
620          simd: v128_bitselect(
621            rhs.simd,
622            f32x4_pmin(self.simd, rhs.simd),
623            f32x4_ne(self.simd, self.simd), // NaN check
624          )
625        }
626      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
627        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
628      } else {
629        Self { arr: [
630          self.arr[0].min(rhs.arr[0]),
631          self.arr[1].min(rhs.arr[1]),
632          self.arr[2].min(rhs.arr[2]),
633          self.arr[3].min(rhs.arr[3]),
634        ]}
635      }
636    }
637  }
638  #[inline]
639  #[must_use]
640  pub fn is_nan(self) -> Self {
641    pick! {
642      if #[cfg(target_feature="sse")] {
643        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
644      } else if #[cfg(target_feature="simd128")] {
645        Self { simd: f32x4_ne(self.simd, self.simd) }
646      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
647        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
648      } else {
649        Self { arr: [
650          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
651          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
652          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
653          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
654        ]}
655      }
656    }
657  }
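  /// Lanewise test for finite values (neither NaN nor infinite), returning an
  /// all-1s mask in finite lanes and an all-0s mask elsewhere. A small sketch:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x4::from([1.0, f32::NAN, f32::INFINITY, -0.0]);
  /// assert_eq!(v.is_finite().move_mask(), 0b1001);
  /// ```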
658  #[inline]
659  #[must_use]
660  pub fn is_finite(self) -> Self {
661    let shifted_exp_mask = u32x4::from(0xFF000000);
662    let u: u32x4 = cast(self);
663    let shift_u = u << 1_u64;
664    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
665    cast(out)
666  }
667  #[inline]
668  #[must_use]
669  pub fn is_inf(self) -> Self {
670    let shifted_inf = u32x4::from(0xFF000000);
671    let u: u32x4 = cast(self);
672    let shift_u = u << 1_u64;
673    let out = (shift_u).cmp_eq(shifted_inf);
674    cast(out)
675  }
676
677  #[inline]
678  #[must_use]
679  pub fn round(self) -> Self {
680    pick! {
681      if #[cfg(target_feature="sse4.1")] {
682        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
683      } else if #[cfg(target_feature="sse2")] {
684        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
685        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
686        let i: i32x4 = cast(mi);
687        let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32)));
688        mask.blend(self, f)
689      } else if #[cfg(target_feature="simd128")] {
690        Self { simd: f32x4_nearest(self.simd) }
691      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
692        unsafe {Self { neon: vrndnq_f32(self.neon) }}
693      } else {
        // Note(Lokathor): This software fallback is probably very slow compared
        // to having a hardware option available; even just the sse2 version is
        // better than this. Oh well.
697        let to_int = f32x4::from(1.0 / f32::EPSILON);
698        let u: u32x4 = cast(self);
699        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
700        let mut y: f32x4;
701
702        let no_op_magic = i32x4::from(0x7f + 23);
703        let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
704        let no_op_val: f32x4 = self;
705
706        let zero_magic = i32x4::from(0x7f - 1);
707        let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
708        let zero_val: f32x4 = self * f32x4::from(0.0);
709
710        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
711        let x: f32x4 = neg_bit.blend(-self, self);
712        y = x + to_int - to_int - x;
713        y = y.cmp_gt(f32x4::from(0.5)).blend(
714          y + x - f32x4::from(-1.0),
715          y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
716        );
717        y = neg_bit.blend(-y, y);
718
719        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
720      }
721    }
722  }
723
724  /// Rounds each lane into an integer. This is a faster implementation than
725  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
726  /// values you get implementation defined behavior.
727  #[inline]
728  #[must_use]
729  pub fn fast_round_int(self) -> i32x4 {
730    pick! {
731      if #[cfg(target_feature="sse2")] {
732        cast(convert_to_i32_m128i_from_m128(self.sse))
733      } else {
734        self.round_int()
735      }
736    }
737  }
738
739  /// Rounds each lane into an integer. This saturates out of range values and
740  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
741  /// doesn't handle out of range values or NaNs.
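  ///
  /// A rough sketch of the saturating behavior, with values picked for
  /// illustration:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x4::from([1.7, -2.3, f32::NAN, 3_000_000_000.0]);
  /// assert_eq!(v.round_int().to_array(), [2, -2, 0, i32::MAX]);
  /// ```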
742  #[inline]
743  #[must_use]
744  pub fn round_int(self) -> i32x4 {
745    pick! {
746      if #[cfg(target_feature="sse2")] {
747        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
748        let non_nan_mask = self.cmp_eq(self);
749        let non_nan = self & non_nan_mask;
750        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
751        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
752        flip_to_max ^ cast
753      } else if #[cfg(target_feature="simd128")] {
754        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
755      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
756        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
757      } else {
758        let rounded: [f32; 4] = cast(self.round());
759        cast([
760          rounded[0] as i32,
761          rounded[1] as i32,
762          rounded[2] as i32,
763          rounded[3] as i32,
764        ])
765      }
766    }
767  }
768
769  /// Truncates each lane into an integer. This is a faster implementation than
770  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
771  /// values you get implementation defined behavior.
772  #[inline]
773  #[must_use]
774  pub fn fast_trunc_int(self) -> i32x4 {
775    pick! {
776      if #[cfg(target_feature="sse2")] {
777        cast(truncate_m128_to_m128i(self.sse))
778      } else {
779        self.trunc_int()
780      }
781    }
782  }
783
784  /// Truncates each lane into an integer. This saturates out of range values
785  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
786  /// that doesn't handle out of range values or NaNs.
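  ///
  /// A small sketch of truncation toward zero:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x4::from([1.9, -2.9, 0.4, -0.4]);
  /// assert_eq!(v.trunc_int().to_array(), [1, -2, 0, 0]);
  /// ```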
787  #[inline]
788  #[must_use]
789  pub fn trunc_int(self) -> i32x4 {
790    pick! {
791      if #[cfg(target_feature="sse2")] {
792        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
793        let non_nan_mask = self.cmp_eq(self);
794        let non_nan = self & non_nan_mask;
795        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
796        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
797        flip_to_max ^ cast
798      } else if #[cfg(target_feature="simd128")] {
799        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
800      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
801        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
802      } else {
803        let n: [f32;4] = cast(self);
804        cast([
805          n[0] as i32,
806          n[1] as i32,
807          n[2] as i32,
808          n[3] as i32,
809        ])
810      }
811    }
812  }
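  /// Lanewise `(self * m) + a`, using a fused multiply-add (one rounding step)
  /// when the fma target feature is enabled, and a plain multiply-then-add
  /// otherwise. A small sketch with exactly representable values:
  ///
  /// ```
  /// # use wide::*;
  /// let r = f32x4::from([1.0, 2.0, 3.0, 4.0]).mul_add(f32x4::splat(2.0), f32x4::ONE);
  /// assert_eq!(r.to_array(), [3.0, 5.0, 7.0, 9.0]);
  /// ```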
813  #[inline]
814  #[must_use]
815  pub fn mul_add(self, m: Self, a: Self) -> Self {
816    pick! {
817      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
818        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
819      } else {
820        (self * m) + a
821      }
822    }
823  }
824
825  #[inline]
826  #[must_use]
827  pub fn mul_sub(self, m: Self, s: Self) -> Self {
828    pick! {
829      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
830        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
831      } else {
832        (self * m) - s
833      }
834    }
835  }
836
837  #[inline]
838  #[must_use]
839  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
840    pick! {
841      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
842        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
843      } else {
844        a - (self * m)
845      }
846    }
847  }
848
849  #[inline]
850  #[must_use]
851  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
852    pick! {
853      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
854        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) }
855      } else {
856        -(self * m) - a
857      }
858    }
859  }
860
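  /// Flips the sign of each lane of `self` wherever the matching lane of
  /// `signs` is negative (only the sign bit of `signs` is used). A small
  /// sketch:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x4::from([1.0, -2.0, 3.0, -4.0]);
  /// let signs = f32x4::from([-1.0, 1.0, -5.0, 5.0]);
  /// assert_eq!(v.flip_signs(signs).to_array(), [-1.0, -2.0, -3.0, -4.0]);
  /// ```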
861  #[inline]
862  #[must_use]
863  pub fn flip_signs(self, signs: Self) -> Self {
864    self ^ (signs & Self::from(-0.0))
865  }
866
867  #[inline]
868  #[must_use]
869  pub fn copysign(self, sign: Self) -> Self {
870    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
871    (self & magnitude_mask) | (sign & Self::from(-0.0))
872  }
873
874  #[allow(non_upper_case_globals)]
875  #[inline]
876  pub fn asin_acos(self) -> (Self, Self) {
877    // Based on the Agner Fog "vector class library":
878    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
879    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
880    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
881    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
882    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
883    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
884
885    let xa = self.abs();
886    let big = xa.cmp_ge(f32x4::splat(0.5));
887
888    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
889    let x2 = xa * xa;
890    let x3 = big.blend(x1, x2);
891
892    let xb = x1.sqrt();
893
894    let x4 = big.blend(xb, xa);
895
896    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
897    let z = z.mul_add(x3 * x4, x4);
898
899    let z1 = z + z;
900
901    // acos
902    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
903    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
904    let acos = big.blend(z3, z4);
905
906    // asin
907    let z3 = f32x4::FRAC_PI_2 - z1;
908    let asin = big.blend(z3, z);
909    let asin = asin.flip_signs(self);
910
911    (asin, acos)
912  }
913
914  #[allow(non_upper_case_globals)]
915  #[inline]
916  pub fn asin(self) -> Self {
917    // Based on the Agner Fog "vector class library":
918    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
919    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
920    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
921    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
922    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
923    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
924
925    let xa = self.abs();
926    let big = xa.cmp_ge(f32x4::splat(0.5));
927
928    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
929    let x2 = xa * xa;
930    let x3 = big.blend(x1, x2);
931
932    let xb = x1.sqrt();
933
934    let x4 = big.blend(xb, xa);
935
936    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
937    let z = z.mul_add(x3 * x4, x4);
938
939    let z1 = z + z;
940
941    // asin
942    let z3 = f32x4::FRAC_PI_2 - z1;
943    let asin = big.blend(z3, z);
944    let asin = asin.flip_signs(self);
945
946    asin
947  }
948
949  #[inline]
950  #[must_use]
951  #[allow(non_upper_case_globals)]
952  pub fn acos(self) -> Self {
953    // Based on the Agner Fog "vector class library":
954    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
955    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
956    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
957    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
958    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
959    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
960
961    let xa = self.abs();
962    let big = xa.cmp_ge(f32x4::splat(0.5));
963
964    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
965    let x2 = xa * xa;
966    let x3 = big.blend(x1, x2);
967
968    let xb = x1.sqrt();
969
970    let x4 = big.blend(xb, xa);
971
972    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
973    let z = z.mul_add(x3 * x4, x4);
974
975    let z1 = z + z;
976
977    // acos
978    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
979    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
980    let acos = big.blend(z3, z4);
981
982    acos
983  }
984
985  #[allow(non_upper_case_globals)]
986  #[inline]
987  pub fn atan(self) -> Self {
988    // Based on the Agner Fog "vector class library":
989    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
990    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
991    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
992    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
993    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
994
995    let t = self.abs();
996
997    // small:  z = t / 1.0;
998    // medium: z = (t-1.0) / (t+1.0);
999    // big:    z = -1.0 / t;
1000    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1001    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);
1002
1003    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1004    s = notsmal & s;
1005
1006    let mut a = notbig & t;
1007    a = notsmal.blend(a - Self::ONE, a);
1008    let mut b = notbig & Self::ONE;
1009    b = notsmal.blend(b + t, b);
1010    let z = a / b;
1011
1012    let zz = z * z;
1013
1014    // Taylor expansion
1015    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1016    re = re.mul_add(zz * z, z) + s;
1017
1018    // get sign bit
1019    re = (self.sign_bit()).blend(-re, re);
1020
1021    re
1022  }
1023
1024  #[allow(non_upper_case_globals)]
1025  #[inline]
1026  pub fn atan2(self, x: Self) -> Self {
1027    // Based on the Agner Fog "vector class library":
1028    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1029    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1030    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1031    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1032    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1033
1034    let y = self;
1035
1036    // move in first octant
1037    let x1 = x.abs();
1038    let y1 = y.abs();
1039    let swapxy = y1.cmp_gt(x1);
1040    // swap x and y if y1 > x1
1041    let mut x2 = swapxy.blend(y1, x1);
1042    let mut y2 = swapxy.blend(x1, y1);
1043
1044    // check for special case: x and y are both +/- INF
1045    let both_infinite = x.is_inf() & y.is_inf();
1046    if both_infinite.any() {
1047      let minus_one = -Self::ONE;
1048      x2 = both_infinite.blend(x2 & minus_one, x2);
1049      y2 = both_infinite.blend(y2 & minus_one, y2);
1050    }
1051
1052    // x = y = 0 will produce NAN. No problem, fixed below
1053    let t = y2 / x2;
1054
1055    // small:  z = t / 1.0;
1056    // medium: z = (t-1.0) / (t+1.0);
1057    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1058
1059    let a = notsmal.blend(t - Self::ONE, t);
1060    let b = notsmal.blend(t + Self::ONE, Self::ONE);
1061    let s = notsmal & Self::FRAC_PI_4;
1062    let z = a / b;
1063
1064    let zz = z * z;
1065
1066    // Taylor expansion
1067    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1068    re = re.mul_add(zz * z, z) + s;
1069
1070    // move back in place
1071    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1072    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
1073    re = (x.sign_bit()).blend(Self::PI - re, re);
1074
1075    // get sign bit
1076    re = (y.sign_bit()).blend(-re, re);
1077
1078    re
1079  }
1080
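  /// Computes the sine and cosine of each lane at the same time (angles in
  /// radians). The result is a polynomial approximation, so the sketch below
  /// compares with a tolerance rather than exact equality:
  ///
  /// ```
  /// # use wide::*;
  /// let (s, c) = f32x4::from([0.0, core::f32::consts::FRAC_PI_2, 1.0, -1.0]).sin_cos();
  /// assert!((s.to_array()[1] - 1.0).abs() < 1e-6);
  /// assert!((c.to_array()[0] - 1.0).abs() < 1e-6);
  /// ```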
1081  #[inline]
1082  #[must_use]
1083  #[allow(non_upper_case_globals)]
1084  pub fn sin_cos(self) -> (Self, Self) {
1085    // Based on the Agner Fog "vector class library":
1086    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1087
1088    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
1089    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1090    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1091
1092    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
1093    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
1094    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);
1095
1096    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
1097    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
1098    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);
1099
1100    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1101
1102    let xa = self.abs();
1103
1104    // Find quadrant
1105    let y = (xa * TWO_OVER_PI).round();
1106    let q: i32x4 = y.round_int();
1107
1108    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1109
1110    let x2 = x * x;
1111    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1112    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1113      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));
1114
1115    let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0));
1116
1117    let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000)));
1118    overflow &= xa.is_finite();
1119    s = overflow.blend(f32x4::from(0.0), s);
1120    c = overflow.blend(f32x4::from(1.0), c);
1121
1122    // calc sin
1123    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
1124    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
1125    sin1 = sin1.flip_signs(cast(sign_sin));
1126
1127    // calc cos
1128    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
1129    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
1130    cos1 ^= cast::<_, f32x4>(sign_cos);
1131
1132    (sin1, cos1)
1133  }
1134
1135  #[inline]
1136  #[must_use]
1137  pub fn sin(self) -> Self {
1138    let (s, _) = self.sin_cos();
1139    s
1140  }
1141  #[inline]
1142  #[must_use]
1143  pub fn cos(self) -> Self {
1144    let (_, c) = self.sin_cos();
1145    c
1146  }
1147  #[inline]
1148  #[must_use]
1149  pub fn tan(self) -> Self {
1150    let (s, c) = self.sin_cos();
1151    s / c
1152  }
1153  #[inline]
1154  #[must_use]
1155  pub fn to_degrees(self) -> Self {
1156    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1157    self * RAD_TO_DEG_RATIO
1158  }
1159  #[inline]
1160  #[must_use]
1161  pub fn to_radians(self) -> Self {
1162    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1163    self * DEG_TO_RAD_RATIO
1164  }
1165  #[inline]
1166  #[must_use]
1167  pub fn recip(self) -> Self {
1168    pick! {
1169      if #[cfg(target_feature="sse")] {
1170        Self { sse: reciprocal_m128(self.sse) }
1171      } else if #[cfg(target_feature="simd128")] {
1172        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1173      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1174        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1175      } else {
1176        Self { arr: [
1177          1.0 / self.arr[0],
1178          1.0 / self.arr[1],
1179          1.0 / self.arr[2],
1180          1.0 / self.arr[3],
1181        ]}
1182      }
1183    }
1184  }
1185  #[inline]
1186  #[must_use]
1187  pub fn recip_sqrt(self) -> Self {
1188    pick! {
1189      if #[cfg(target_feature="sse")] {
1190        Self { sse: reciprocal_sqrt_m128(self.sse) }
1191      } else if #[cfg(target_feature="simd128")] {
1192        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1193      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1194        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1195      } else if #[cfg(feature="std")] {
1196        Self { arr: [
1197          1.0 / self.arr[0].sqrt(),
1198          1.0 / self.arr[1].sqrt(),
1199          1.0 / self.arr[2].sqrt(),
1200          1.0 / self.arr[3].sqrt(),
1201        ]}
1202      } else {
1203        Self { arr: [
1204          1.0 / software_sqrt(self.arr[0] as f64) as f32,
1205          1.0 / software_sqrt(self.arr[1] as f64) as f32,
1206          1.0 / software_sqrt(self.arr[2] as f64) as f32,
1207          1.0 / software_sqrt(self.arr[3] as f64) as f32,
1208        ]}
1209      }
1210    }
1211  }
1212  #[inline]
1213  #[must_use]
1214  pub fn sqrt(self) -> Self {
1215    pick! {
1216      if #[cfg(target_feature="sse")] {
1217        Self { sse: sqrt_m128(self.sse) }
1218      } else if #[cfg(target_feature="simd128")] {
1219        Self { simd: f32x4_sqrt(self.simd) }
1220      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1221        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1222      } else if #[cfg(feature="std")] {
1223        Self { arr: [
1224          self.arr[0].sqrt(),
1225          self.arr[1].sqrt(),
1226          self.arr[2].sqrt(),
1227          self.arr[3].sqrt(),
1228        ]}
1229      } else {
1230        Self { arr: [
1231          software_sqrt(self.arr[0] as f64) as f32,
1232          software_sqrt(self.arr[1] as f64) as f32,
1233          software_sqrt(self.arr[2] as f64) as f32,
1234          software_sqrt(self.arr[3] as f64) as f32,
1235        ]}
1236      }
1237    }
1238  }
1239
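  /// Packs the sign bit of each lane into the low 4 bits of an `i32`, with
  /// lane 0 in bit 0. A small sketch:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x4::from([-1.0, 2.0, -3.0, 4.0]);
  /// assert_eq!(v.move_mask(), 0b0101);
  /// ```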
1240  #[inline]
1241  #[must_use]
1242  pub fn move_mask(self) -> i32 {
1243    pick! {
1244      if #[cfg(target_feature="sse")] {
1245        move_mask_m128(self.sse)
1246      } else if #[cfg(target_feature="simd128")] {
1247        u32x4_bitmask(self.simd) as i32
1248      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          // set each lane to all-1s if its top (sign) bit is set, else all-0s
          let masked = vcltq_s32(vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));

          // select the right output bit out of each lane
          let selectbit: uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
          let r = vandq_u32(masked, selectbit);

          // horizontally add the 32-bit lanes
          vaddvq_u32(r) as i32
        }
1261      } else {
1262        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
1263        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
1264        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
1265        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3
1266      }
1267    }
1268  }
1269  #[inline]
1270  #[must_use]
1271  pub fn any(self) -> bool {
1272    pick! {
1273      if #[cfg(target_feature="simd128")] {
1274        v128_any_true(self.simd)
1275      } else {
1276        self.move_mask() != 0
1277      }
1278    }
1279  }
1280  #[inline]
1281  #[must_use]
1282  pub fn all(self) -> bool {
1283    pick! {
1284      if #[cfg(target_feature="simd128")] {
1285        u32x4_all_true(self.simd)
1286      } else {
1287        // four lanes
1288        self.move_mask() == 0b1111
1289      }
1290    }
1291  }
1292  #[inline]
1293  #[must_use]
1294  pub fn none(self) -> bool {
1295    !self.any()
1296  }
1297
1298  #[inline]
1299  #[allow(non_upper_case_globals)]
1300  fn vm_pow2n(self) -> Self {
1301    const_f32_as_f32x4!(pow2_23, 8388608.0);
1302    const_f32_as_f32x4!(bias, 127.0);
1303    let a = self + (bias + pow2_23);
1304    let c = cast::<_, i32x4>(a) << 23;
1305    cast::<_, f32x4>(c)
1306  }
1307
  /// Calculates the exponential function `e^x` for each lane.
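  ///
  /// The result is a polynomial approximation, so the sketch below compares
  /// with a tolerance:
  ///
  /// ```
  /// # use wide::*;
  /// let e = f32x4::from([0.0, 1.0, -1.0, 2.0]).exp().to_array();
  /// assert!((e[0] - 1.0).abs() < 1e-6);
  /// assert!((e[1] - core::f32::consts::E).abs() < 1e-5);
  /// ```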
1309  #[inline]
1310  #[must_use]
1311  #[allow(non_upper_case_globals)]
1312  pub fn exp(self) -> Self {
1313    const_f32_as_f32x4!(P0, 1.0 / 2.0);
1314    const_f32_as_f32x4!(P1, 1.0 / 6.0);
1315    const_f32_as_f32x4!(P2, 1. / 24.);
1316    const_f32_as_f32x4!(P3, 1. / 120.);
1317    const_f32_as_f32x4!(P4, 1. / 720.);
1318    const_f32_as_f32x4!(P5, 1. / 5040.);
1319    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1320    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1321    let max_x = f32x4::from(87.3);
1322    let r = (self * Self::LOG2_E).round();
1323    let x = r.mul_neg_add(LN2D_HI, self);
1324    let x = r.mul_neg_add(LN2D_LO, x);
1325    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1326    let x2 = x * x;
1327    let z = z.mul_add(x2, x);
1328    let n2 = Self::vm_pow2n(r);
1329    let z = (z + Self::ONE) * n2;
1330    // check for overflow
1331    let in_range = self.abs().cmp_lt(max_x);
1332    let in_range = in_range & self.is_finite();
1333    in_range.blend(z, Self::ZERO)
1334  }
1335
1336  #[inline]
1337  #[allow(non_upper_case_globals)]
1338  fn exponent(self) -> f32x4 {
1339    const_f32_as_f32x4!(pow2_23, 8388608.0);
1340    const_f32_as_f32x4!(bias, 127.0);
1341    let a = cast::<_, u32x4>(self);
1342    let b = a >> 23;
1343    let c = b | cast::<_, u32x4>(pow2_23);
1344    let d = cast::<_, f32x4>(c);
1345    let e = d - (pow2_23 + bias);
1346    e
1347  }
1348
1349  #[inline]
1350  #[allow(non_upper_case_globals)]
1351  fn fraction_2(self) -> Self {
1352    let t1 = cast::<_, u32x4>(self);
1353    let t2 = cast::<_, u32x4>(
1354      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1355    );
1356    cast::<_, f32x4>(t2)
1357  }
1358  #[inline]
1359  fn is_zero_or_subnormal(self) -> Self {
1360    let t = cast::<_, i32x4>(self);
1361    let t = t & i32x4::splat(0x7F800000);
1362    i32x4::round_float(t.cmp_eq(i32x4::splat(0)))
1363  }
1364  #[inline]
1365  fn infinity() -> Self {
1366    cast::<_, f32x4>(i32x4::splat(0x7F800000))
1367  }
1368  #[inline]
1369  fn nan_log() -> Self {
1370    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1371  }
1372  #[inline]
1373  fn nan_pow() -> Self {
1374    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1375  }
1376  #[inline]
1377  pub fn sign_bit(self) -> Self {
1378    let t1 = cast::<_, i32x4>(self);
1379    let t2 = t1 >> 31;
1380    !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO)
1381  }
1382
  /// Horizontal add of all the elements of the vector.
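  ///
  /// A small sketch with exactly representable values:
  ///
  /// ```
  /// # use wide::*;
  /// assert_eq!(f32x4::from([1.0, 2.0, 3.0, 4.0]).reduce_add(), 10.0);
  /// ```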
1384  #[inline]
1385  #[must_use]
1386  pub fn reduce_add(self) -> f32 {
1387    let arr: [f32; 4] = cast(self);
1388    arr.iter().sum()
1389  }
1390
1391  /// Natural log (ln(x))
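  ///
  /// Approximate, so the sketch below compares with a tolerance:
  ///
  /// ```
  /// # use wide::*;
  /// let l = f32x4::from([1.0, core::f32::consts::E, 10.0, 0.5]).ln().to_array();
  /// assert!(l[0].abs() < 1e-6);
  /// assert!((l[1] - 1.0).abs() < 1e-5);
  /// assert!((l[2] - 2.302_585).abs() < 1e-4);
  /// ```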
1392  #[inline]
1393  #[must_use]
1394  #[allow(non_upper_case_globals)]
1395  pub fn ln(self) -> Self {
1396    const_f32_as_f32x4!(HALF, 0.5);
1397    const_f32_as_f32x4!(P0, 3.3333331174E-1);
1398    const_f32_as_f32x4!(P1, -2.4999993993E-1);
1399    const_f32_as_f32x4!(P2, 2.0000714765E-1);
1400    const_f32_as_f32x4!(P3, -1.6668057665E-1);
1401    const_f32_as_f32x4!(P4, 1.4249322787E-1);
1402    const_f32_as_f32x4!(P5, -1.2420140846E-1);
1403    const_f32_as_f32x4!(P6, 1.1676998740E-1);
1404    const_f32_as_f32x4!(P7, -1.1514610310E-1);
1405    const_f32_as_f32x4!(P8, 7.0376836292E-2);
1406    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1407    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1408    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1409
1410    let x1 = self;
1411    let x = Self::fraction_2(x1);
1412    let e = Self::exponent(x1);
1413    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
1414    let x = (!mask).blend(x + x, x);
1415    let fe = mask.blend(e + Self::ONE, e);
1416    let x = x - Self::ONE;
1417    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1418    let x2 = x * x;
1419    let res = x2 * x * res;
1420    let res = fe.mul_add(LN2F_LO, res);
1421    let res = res + x2.mul_neg_add(HALF, x);
1422    let res = fe.mul_add(LN2F_HI, res);
1423    let overflow = !self.is_finite();
1424    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
1425    let mask = overflow | underflow;
1426    if !mask.any() {
1427      res
1428    } else {
1429      let is_zero = self.is_zero_or_subnormal();
1430      let res = underflow.blend(Self::nan_log(), res);
1431      let res = is_zero.blend(Self::infinity(), res);
1432      let res = overflow.blend(self, res);
1433      res
1434    }
1435  }
1436
1437  #[inline]
1438  #[must_use]
1439  pub fn log2(self) -> Self {
1440    Self::ln(self) * Self::LOG2_E
1441  }
1442  #[inline]
1443  #[must_use]
1444  pub fn log10(self) -> Self {
1445    Self::ln(self) * Self::LOG10_E
1446  }
1447
1448  #[inline]
1449  #[must_use]
1450  #[allow(non_upper_case_globals)]
1451  pub fn pow_f32x4(self, y: f32x4) -> Self {
1452    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1453    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1454    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1455    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1456    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1457    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1458    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1459    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1460    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1461    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1462    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1463
1464    const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1465    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1466    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1467    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1468    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1469    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1470
1471    let x1 = self.abs();
1472    let x = x1.fraction_2();
1473
1474    let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF);
1475    let x = (!mask).blend(x + x, x);
1476
1477    let x = x - f32x4::ONE;
1478    let x2 = x * x;
1479    let lg1 = polynomial_8!(
1480      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1481    );
1482    let lg1 = lg1 * x2 * x;
1483
1484    let ef = x1.exponent();
1485    let ef = mask.blend(ef + f32x4::ONE, ef);
1486
1487    let e1 = (ef * y).round();
1488    let yr = ef.mul_sub(y, e1);
1489
1490    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1491    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1492    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1493
1494    let e2 = (lg * y * f32x4::LOG2_E).round();
1495    let v = lg.mul_sub(y, e2 * ln2f_hi);
1496    let v = e2.mul_neg_add(ln2f_lo, v);
1497    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1498
1499    let x = v;
1500    let e3 = (x * f32x4::LOG2_E).round();
1501    let x = e3.mul_neg_add(f32x4::LN_2, x);
1502    let x2 = x * x;
1503    let z = x2.mul_add(
1504      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1505      x + f32x4::ONE,
1506    );
1507
1508    let ee = e1 + e2 + e3;
1509    let ei = cast::<_, i32x4>(ee.round_int());
1510    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1511
1512    let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF)))
1513      | (ee.cmp_gt(f32x4::splat(300.0)));
1514    let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000)))
1515      | (ee.cmp_lt(f32x4::splat(-300.0)));
1516
1517    // Add exponent by integer addition
1518    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1519
1520    // Check for overflow/underflow
1521    let z = if (overflow | underflow).any() {
1522      let z = underflow.blend(f32x4::ZERO, z);
1523      overflow.blend(Self::infinity(), z)
1524    } else {
1525      z
1526    };
1527
1528    // Check for self == 0
1529    let x_zero = self.is_zero_or_subnormal();
1530    let z = x_zero.blend(
1531      y.cmp_lt(f32x4::ZERO).blend(
1532        Self::infinity(),
1533        y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1534      ),
1535      z,
1536    );
1537
1538    let x_sign = self.sign_bit();
1539    let z = if x_sign.any() {
      // is y an integer?
1541      let yi = y.cmp_eq(y.round());
1542      // Is y odd?
1543      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1544
1545      let z1 =
1546        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
1547      x_sign.blend(z1, z)
1548    } else {
1549      z
1550    };
1551
1552    let x_finite = self.is_finite();
1553    let y_finite = y.is_finite();
1554    let e_finite = ee.is_finite();
1555    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1556      return z;
1557    }
1558
1559    (self.is_nan() | y.is_nan()).blend(self + y, z)
1560  }
1561
1562  #[inline]
1563  pub fn powf(self, y: f32) -> Self {
1564    Self::pow_f32x4(self, f32x4::splat(y))
1565  }
1566
1567  #[inline]
1568  pub fn to_array(self) -> [f32; 4] {
1569    cast(self)
1570  }
1571
1572  #[inline]
1573  pub fn as_array_ref(&self) -> &[f32; 4] {
1574    cast_ref(self)
1575  }
1576
1577  #[inline]
1578  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
1579    cast_mut(self)
1580  }
1581}