wide/
i16x8_.rs

1use super::*;
2
// Storage for `i16x8` is selected at compile time from the enabled target
// features. Each branch names its single field differently (`sse`, `simd`,
// `neon`, `arr`), which is why the operations in this file repeat the same
// `pick!` conditions.
pick! {
  if #[cfg(target_feature="sse2")] {
    // SSE2: one `m128i`; the common traits are derived.
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    // WebAssembly SIMD: one `v128`. `Default`, `PartialEq` and `Eq` are
    // written by hand below instead of being derived.
    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct i16x8 { pub(crate) simd: v128 }

    impl Default for i16x8 {
      // The default value has every lane set to zero.
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      // Two vectors are equal only when the lane-wise equality mask is set in
      // every lane.
      fn eq(&self, other: &Self) -> bool {
        u16x8_all_true(i16x8_eq(self.simd, other.simd))
      }
    }

    impl Eq for i16x8 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    // AArch64 NEON: one `int16x8_t`. `Default`, `PartialEq` and `Eq` are
    // written by hand below instead of being derived.
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct i16x8 { pub(crate) neon : int16x8_t }

    impl Default for i16x8 {
      // The default value has every lane set to zero.
      #[inline]
      #[must_use]
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        // `vceqq_s16` produces an all-ones lane (u16::MAX) where the inputs
        // match; the horizontal minimum is u16::MAX only if every lane matched.
        unsafe { vminvq_u16(vceqq_s16(self.neon, other.neon))==u16::MAX }
      }
    }

    impl Eq for i16x8 { }
  } else {
    // Portable fallback: a plain array of eight lanes, aligned to 16 bytes.
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) arr: [i16;8] }
  }
}
57
// Expands the crate's shared helper macro for this lane type and count
// (`i16`, 8 lanes, 128 bits). The macro is defined outside this file.
int_uint_consts!(i16, 8, i16x8, i16x8, i16a8, const_i16_as_i16x8, 128);

// SAFETY: NOTE(review) - every representation of `i16x8` defined in this file
// is a single 128-bit integer vector or a `[i16; 8]`, so presumably there is
// no padding and every bit pattern (including all zeros) is valid; confirm
// this for any new backend before relying on these impls.
unsafe impl Zeroable for i16x8 {}
unsafe impl Pod for i16x8 {}
62
63impl Add for i16x8 {
64  type Output = Self;
65  #[inline]
66  #[must_use]
67  fn add(self, rhs: Self) -> Self::Output {
68    pick! {
69      if #[cfg(target_feature="sse2")] {
70        Self { sse: add_i16_m128i(self.sse, rhs.sse) }
71      } else if #[cfg(target_feature="simd128")] {
72        Self { simd: i16x8_add(self.simd, rhs.simd) }
73      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
74        unsafe { Self { neon: vaddq_s16(self.neon, rhs.neon) } }
75      } else {
76        Self { arr: [
77          self.arr[0].wrapping_add(rhs.arr[0]),
78          self.arr[1].wrapping_add(rhs.arr[1]),
79          self.arr[2].wrapping_add(rhs.arr[2]),
80          self.arr[3].wrapping_add(rhs.arr[3]),
81          self.arr[4].wrapping_add(rhs.arr[4]),
82          self.arr[5].wrapping_add(rhs.arr[5]),
83          self.arr[6].wrapping_add(rhs.arr[6]),
84          self.arr[7].wrapping_add(rhs.arr[7]),
85        ]}
86      }
87    }
88  }
89}
90
91impl Sub for i16x8 {
92  type Output = Self;
93  #[inline]
94  #[must_use]
95  fn sub(self, rhs: Self) -> Self::Output {
96    pick! {
97      if #[cfg(target_feature="sse2")] {
98        Self { sse: sub_i16_m128i(self.sse, rhs.sse) }
99      } else if #[cfg(target_feature="simd128")] {
100        Self { simd: i16x8_sub(self.simd, rhs.simd) }
101      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102        unsafe {Self { neon: vsubq_s16(self.neon, rhs.neon) }}
103      } else {
104        Self { arr: [
105          self.arr[0].wrapping_sub(rhs.arr[0]),
106          self.arr[1].wrapping_sub(rhs.arr[1]),
107          self.arr[2].wrapping_sub(rhs.arr[2]),
108          self.arr[3].wrapping_sub(rhs.arr[3]),
109          self.arr[4].wrapping_sub(rhs.arr[4]),
110          self.arr[5].wrapping_sub(rhs.arr[5]),
111          self.arr[6].wrapping_sub(rhs.arr[6]),
112          self.arr[7].wrapping_sub(rhs.arr[7]),
113        ]}
114      }
115    }
116  }
117}
118
119impl Mul for i16x8 {
120  type Output = Self;
121  #[inline]
122  #[must_use]
123  fn mul(self, rhs: Self) -> Self::Output {
124    pick! {
125      if #[cfg(target_feature="sse2")] {
126        Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) }
127      } else if #[cfg(target_feature="simd128")] {
128        Self { simd: i16x8_mul(self.simd, rhs.simd) }
129      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
130        unsafe {Self { neon: vmulq_s16(self.neon, rhs.neon) }}
131      } else {
132        Self { arr: [
133          self.arr[0].wrapping_mul(rhs.arr[0]),
134          self.arr[1].wrapping_mul(rhs.arr[1]),
135          self.arr[2].wrapping_mul(rhs.arr[2]),
136          self.arr[3].wrapping_mul(rhs.arr[3]),
137          self.arr[4].wrapping_mul(rhs.arr[4]),
138          self.arr[5].wrapping_mul(rhs.arr[5]),
139          self.arr[6].wrapping_mul(rhs.arr[6]),
140          self.arr[7].wrapping_mul(rhs.arr[7]),
141        ]}
142      }
143    }
144  }
145}
146
147impl Add<i16> for i16x8 {
148  type Output = Self;
149  #[inline]
150  #[must_use]
151  fn add(self, rhs: i16) -> Self::Output {
152    self.add(Self::splat(rhs))
153  }
154}
155
156impl Sub<i16> for i16x8 {
157  type Output = Self;
158  #[inline]
159  #[must_use]
160  fn sub(self, rhs: i16) -> Self::Output {
161    self.sub(Self::splat(rhs))
162  }
163}
164
165impl Mul<i16> for i16x8 {
166  type Output = Self;
167  #[inline]
168  #[must_use]
169  fn mul(self, rhs: i16) -> Self::Output {
170    self.mul(Self::splat(rhs))
171  }
172}
173
174impl Add<i16x8> for i16 {
175  type Output = i16x8;
176  #[inline]
177  #[must_use]
178  fn add(self, rhs: i16x8) -> Self::Output {
179    i16x8::splat(self).add(rhs)
180  }
181}
182
183impl Sub<i16x8> for i16 {
184  type Output = i16x8;
185  #[inline]
186  #[must_use]
187  fn sub(self, rhs: i16x8) -> Self::Output {
188    i16x8::splat(self).sub(rhs)
189  }
190}
191
192impl Mul<i16x8> for i16 {
193  type Output = i16x8;
194  #[inline]
195  #[must_use]
196  fn mul(self, rhs: i16x8) -> Self::Output {
197    i16x8::splat(self).mul(rhs)
198  }
199}
200
201impl BitAnd for i16x8 {
202  type Output = Self;
203  #[inline]
204  #[must_use]
205  fn bitand(self, rhs: Self) -> Self::Output {
206    pick! {
207      if #[cfg(target_feature="sse2")] {
208        Self { sse: bitand_m128i(self.sse, rhs.sse) }
209      } else if #[cfg(target_feature="simd128")] {
210        Self { simd: v128_and(self.simd, rhs.simd) }
211      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
212        unsafe {Self { neon: vandq_s16(self.neon, rhs.neon) }}
213      } else {
214        Self { arr: [
215          self.arr[0].bitand(rhs.arr[0]),
216          self.arr[1].bitand(rhs.arr[1]),
217          self.arr[2].bitand(rhs.arr[2]),
218          self.arr[3].bitand(rhs.arr[3]),
219          self.arr[4].bitand(rhs.arr[4]),
220          self.arr[5].bitand(rhs.arr[5]),
221          self.arr[6].bitand(rhs.arr[6]),
222          self.arr[7].bitand(rhs.arr[7]),
223        ]}
224      }
225    }
226  }
227}
228
229impl BitOr for i16x8 {
230  type Output = Self;
231  #[inline]
232  #[must_use]
233  fn bitor(self, rhs: Self) -> Self::Output {
234    pick! {
235      if #[cfg(target_feature="sse2")] {
236        Self { sse: bitor_m128i(self.sse, rhs.sse) }
237      } else if #[cfg(target_feature="simd128")] {
238        Self { simd: v128_or(self.simd, rhs.simd) }
239      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
240        unsafe {Self { neon: vorrq_s16(self.neon, rhs.neon) }}
241      } else {
242        Self { arr: [
243          self.arr[0].bitor(rhs.arr[0]),
244          self.arr[1].bitor(rhs.arr[1]),
245          self.arr[2].bitor(rhs.arr[2]),
246          self.arr[3].bitor(rhs.arr[3]),
247          self.arr[4].bitor(rhs.arr[4]),
248          self.arr[5].bitor(rhs.arr[5]),
249          self.arr[6].bitor(rhs.arr[6]),
250          self.arr[7].bitor(rhs.arr[7]),
251        ]}
252      }
253    }
254  }
255}
256
257impl BitXor for i16x8 {
258  type Output = Self;
259  #[inline]
260  #[must_use]
261  fn bitxor(self, rhs: Self) -> Self::Output {
262    pick! {
263      if #[cfg(target_feature="sse2")] {
264        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
265      } else if #[cfg(target_feature="simd128")] {
266        Self { simd: v128_xor(self.simd, rhs.simd) }
267      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
268        unsafe {Self { neon: veorq_s16(self.neon, rhs.neon) }}
269      } else {
270        Self { arr: [
271          self.arr[0].bitxor(rhs.arr[0]),
272          self.arr[1].bitxor(rhs.arr[1]),
273          self.arr[2].bitxor(rhs.arr[2]),
274          self.arr[3].bitxor(rhs.arr[3]),
275          self.arr[4].bitxor(rhs.arr[4]),
276          self.arr[5].bitxor(rhs.arr[5]),
277          self.arr[6].bitxor(rhs.arr[6]),
278          self.arr[7].bitxor(rhs.arr[7]),
279        ]}
280      }
281    }
282  }
283}
284
// Implements `Shl<T>` for `i16x8` for each integer type `T` passed to the
// macro. The single scalar count in `rhs` is applied to all eight lanes.
macro_rules! impl_shl_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            // The count is widened to u64 and stored as the first element of
            // a two-element array that is reinterpreted as the shift operand.
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            // The count is broadcast to every lane; `vshlq_s16` takes a
            // per-lane shift amount.
            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16(rhs as i16)) }}
          } else {
            // NOTE(review): the plain `<<` operator panics in debug builds
            // when the count is >= 16, unlike the SIMD branches above;
            // confirm whether callers may pass such counts.
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] << u,
              self.arr[1] << u,
              self.arr[2] << u,
              self.arr[3] << u,
              self.arr[4] << u,
              self.arr[5] << u,
              self.arr[6] << u,
              self.arr[7] << u,
            ]}
          }
        }
      }
    })+
  };
}
// Every primitive integer width, signed and unsigned, is accepted as a count.
impl_shl_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
320
// Implements `Shr<T>` for `i16x8` for each integer type `T` passed to the
// macro. The single scalar count in `rhs` is applied to all eight lanes.
macro_rules! impl_shr_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            // The count is widened to u64 and stored as the first element of
            // a two-element array that is reinterpreted as the shift operand.
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_i16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            // The right shift is expressed as `vshlq_s16` with the negated
            // count broadcast to every lane.
            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16( -(rhs as i16))) }}
          } else {
            // NOTE(review): the plain `>>` operator panics in debug builds
            // when the count is >= 16, unlike the SIMD branches above;
            // confirm whether callers may pass such counts.
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] >> u,
              self.arr[1] >> u,
              self.arr[2] >> u,
              self.arr[3] >> u,
              self.arr[4] >> u,
              self.arr[5] >> u,
              self.arr[6] >> u,
              self.arr[7] >> u,
            ]}
          }
        }
      }
    })+
  };
}
// Every primitive integer width, signed and unsigned, is accepted as a count.
impl_shr_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
356
357impl CmpEq for i16x8 {
358  type Output = Self;
359  #[inline]
360  #[must_use]
361  fn cmp_eq(self, rhs: Self) -> Self::Output {
362    pick! {
363      if #[cfg(target_feature="sse2")] {
364        Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) }
365      } else if #[cfg(target_feature="simd128")] {
366        Self { simd: i16x8_eq(self.simd, rhs.simd) }
367      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
368        unsafe {Self { neon: vreinterpretq_s16_u16(vceqq_s16(self.neon, rhs.neon)) }}
369      } else {
370        Self { arr: [
371          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
372          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
373          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
374          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
375          if self.arr[4] == rhs.arr[4] { -1 } else { 0 },
376          if self.arr[5] == rhs.arr[5] { -1 } else { 0 },
377          if self.arr[6] == rhs.arr[6] { -1 } else { 0 },
378          if self.arr[7] == rhs.arr[7] { -1 } else { 0 },
379        ]}
380      }
381    }
382  }
383}
384
385impl CmpGt for i16x8 {
386  type Output = Self;
387  #[inline]
388  #[must_use]
389  fn cmp_gt(self, rhs: Self) -> Self::Output {
390    pick! {
391      if #[cfg(target_feature="sse2")] {
392        Self { sse: cmp_gt_mask_i16_m128i(self.sse, rhs.sse) }
393      } else if #[cfg(target_feature="simd128")] {
394        Self { simd: i16x8_gt(self.simd, rhs.simd) }
395      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
396        unsafe {Self { neon: vreinterpretq_s16_u16(vcgtq_s16(self.neon, rhs.neon)) }}
397      } else {
398        Self { arr: [
399          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
400          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
401          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
402          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
403          if self.arr[4] > rhs.arr[4] { -1 } else { 0 },
404          if self.arr[5] > rhs.arr[5] { -1 } else { 0 },
405          if self.arr[6] > rhs.arr[6] { -1 } else { 0 },
406          if self.arr[7] > rhs.arr[7] { -1 } else { 0 },
407        ]}
408      }
409    }
410  }
411}
412
413impl CmpLt for i16x8 {
414  type Output = Self;
415  #[inline]
416  #[must_use]
417  fn cmp_lt(self, rhs: Self) -> Self::Output {
418    pick! {
419      if #[cfg(target_feature="sse2")] {
420        Self { sse: cmp_lt_mask_i16_m128i(self.sse, rhs.sse) }
421      } else if #[cfg(target_feature="simd128")] {
422        Self { simd: i16x8_lt(self.simd, rhs.simd) }
423      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
424        unsafe {Self { neon: vreinterpretq_s16_u16(vcltq_s16(self.neon, rhs.neon)) }}
425      } else {
426        Self { arr: [
427          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
428          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
429          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
430          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
431          if self.arr[4] < rhs.arr[4] { -1 } else { 0 },
432          if self.arr[5] < rhs.arr[5] { -1 } else { 0 },
433          if self.arr[6] < rhs.arr[6] { -1 } else { 0 },
434          if self.arr[7] < rhs.arr[7] { -1 } else { 0 },
435        ]}
436      }
437    }
438  }
439}
440
441impl i16x8 {
442  #[inline]
443  #[must_use]
444  pub fn new(array: [i16; 8]) -> Self {
445    Self::from(array)
446  }
447
  /// Collects the sign bit of every lane into the low eight bits of an `i32`
  /// (bit `i` is set when lane `i` is negative).
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // Narrow the lanes to bytes, take the byte-wise sign mask, and keep
        // only the low eight bits: the vector is packed with itself, so the
        // upper eight bits just repeat them.
        move_mask_i8_m128i( pack_i16_to_i8_m128i(self.sse,self.sse)) & 0xff
      } else if #[cfg(target_feature="simd128")] {
        i16x8_bitmask(self.simd) as i32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe
        {
          // all bits set in lanes that are negative (top bit set), else 0
          let masked = vcltq_s16(self.neon, vdupq_n_s16(0));

          // keep a distinct single bit per lane: bit i for lane i
          let selectbit : uint16x8_t = core::intrinsics::transmute([1u16, 2, 4, 8, 16, 32, 64, 128]);
          let r = vandq_u16(masked, selectbit);

          // horizontally add the 16-bit lanes; the bits are disjoint, so the
          // sum equals their bitwise OR
          vaddvq_u16(r) as i32
         }
       } else {
        // Portable fallback: test each lane's sign and place it at its index.
        ((self.arr[0] < 0) as i32) << 0 |
        ((self.arr[1] < 0) as i32) << 1 |
        ((self.arr[2] < 0) as i32) << 2 |
        ((self.arr[3] < 0) as i32) << 3 |
        ((self.arr[4] < 0) as i32) << 4 |
        ((self.arr[5] < 0) as i32) << 5 |
        ((self.arr[6] < 0) as i32) << 6 |
        ((self.arr[7] < 0) as i32) << 7
      }
    }
  }
481
482  #[inline]
483  #[must_use]
484  pub fn any(self) -> bool {
485    pick! {
486      if #[cfg(target_feature="sse2")] {
487        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
488      } else if #[cfg(target_feature="simd128")] {
489        u16x8_bitmask(self.simd) != 0
490      } else {
491        let v : [u64;2] = cast(self);
492        ((v[0] | v[1]) & 0x8000800080008000) != 0
493      }
494    }
495  }
496
497  #[inline]
498  #[must_use]
499  pub fn all(self) -> bool {
500    pick! {
501      if #[cfg(target_feature="sse2")] {
502        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
503      } else if #[cfg(target_feature="simd128")] {
504        u16x8_bitmask(self.simd) == 0b11111111
505      } else {
506        let v : [u64;2] = cast(self);
507        (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000
508      }
509    }
510  }
511
512  #[inline]
513  #[must_use]
514  pub fn none(self) -> bool {
515    !self.any()
516  }
517
518  /// Unpack the lower half of the input and expand it to `i16` values.
519  #[inline]
520  pub fn from_u8x16_low(u: u8x16) -> Self {
521    pick! {
522      if #[cfg(target_feature="sse2")] {
523        Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) }
524      } else {
525        let u_arr: [u8; 16] = cast(u);
526        cast([
527          u_arr[0] as u16 as i16,
528          u_arr[1] as u16 as i16,
529          u_arr[2] as u16 as i16,
530          u_arr[3] as u16 as i16,
531          u_arr[4] as u16 as i16,
532          u_arr[5] as u16 as i16,
533          u_arr[6] as u16 as i16,
534          u_arr[7] as u16 as i16,
535        ])
536      }
537    }
538  }
539
540  /// returns low i16 of i32, saturating values that are too large
541  #[inline]
542  #[must_use]
543  pub fn from_i32x8_saturate(v: i32x8) -> Self {
544    pick! {
545      if #[cfg(target_feature="avx2")] {
546        i16x8 { sse: pack_i32_to_i16_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2))  }
547      } else if #[cfg(target_feature="sse2")] {
548        i16x8 { sse: pack_i32_to_i16_m128i( v.a.sse, v.b.sse ) }
549      } else if #[cfg(target_feature="simd128")] {
550        use core::arch::wasm32::*;
551
552        i16x8 { simd: i16x8_narrow_i32x4(v.a.simd, v.b.simd) }
553      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
554        use core::arch::aarch64::*;
555
556        unsafe {
557          i16x8 { neon: vcombine_s16(vqmovn_s32(v.a.neon), vqmovn_s32(v.b.neon)) }
558        }
559      } else {
560        fn clamp(a : i32) -> i16 {
561            if a < i16::MIN as i32 {
562                i16::MIN
563            }
564            else if a > i16::MAX as i32 {
565                i16::MAX
566            } else {
567                a as i16
568            }
569        }
570
571        i16x8::new([
572          clamp(v.as_array_ref()[0]),
573          clamp(v.as_array_ref()[1]),
574          clamp(v.as_array_ref()[2]),
575          clamp(v.as_array_ref()[3]),
576          clamp(v.as_array_ref()[4]),
577          clamp(v.as_array_ref()[5]),
578          clamp(v.as_array_ref()[6]),
579          clamp(v.as_array_ref()[7]),
580        ])
581      }
582    }
583  }
584
585  /// returns low i16 of i32, truncating the upper bits if they are set
586  #[inline]
587  #[must_use]
588  pub fn from_i32x8_truncate(v: i32x8) -> Self {
589    pick! {
590      if #[cfg(target_feature="avx2")] {
591        let a = v.avx2.bitand(set_splat_i32_m256i(0xffff));
592        i16x8 { sse: pack_i32_to_u16_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a) ) }
593      } else if #[cfg(target_feature="sse2")] {
594        let a = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.a.sse));
595        let b = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.b.sse));
596
597        i16x8 { sse: pack_i32_to_i16_m128i( a, b)  }
598      } else {
599      i16x8::new([
600        v.as_array_ref()[0] as i16,
601        v.as_array_ref()[1] as i16,
602        v.as_array_ref()[2] as i16,
603        v.as_array_ref()[3] as i16,
604        v.as_array_ref()[4] as i16,
605        v.as_array_ref()[5] as i16,
606        v.as_array_ref()[6] as i16,
607        v.as_array_ref()[7] as i16,
608      ])
609      }
610    }
611  }
612
  /// Builds a vector from the first eight elements of `input`.
  ///
  /// # Panics
  ///
  /// Panics if `input` holds fewer than eight elements.
  #[inline]
  #[must_use]
  pub fn from_slice_unaligned(input: &[i16]) -> Self {
    // This length check is what makes the raw 16-byte reads below in bounds.
    assert!(input.len() >= 8);

    pick! {
      if #[cfg(target_feature="sse2")] {
        // SAFETY: the assert above guarantees 8 * 2 = 16 readable bytes, and
        // `[u8; 16]` has no alignment requirement beyond 1.
        unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } }
      } else if #[cfg(target_feature="simd128")] {
        // SAFETY: the assert above guarantees 16 readable bytes.
        unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        // SAFETY: the assert above guarantees eight readable `i16` values.
        unsafe { Self { neon: vld1q_s16( input.as_ptr() as *const i16 ) } }
      } else {
        // 2018 edition doesn't have try_into
        // SAFETY: the assert above guarantees eight readable `i16` values,
        // and `[i16; 8]` has the same alignment as `i16`.
        unsafe { Self::new( *(input.as_ptr() as * const [i16;8]) ) }
      }
    }
  }
631
632  #[inline]
633  #[must_use]
634  pub fn blend(self, t: Self, f: Self) -> Self {
635    pick! {
636      if #[cfg(target_feature="sse4.1")] {
637        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
638      } else if #[cfg(target_feature="simd128")] {
639        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
640      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
641        unsafe {Self { neon: vbslq_s16(vreinterpretq_u16_s16(self.neon), t.neon, f.neon) }}
642      } else {
643        generic_bit_blend(self, t, f)
644      }
645    }
646  }
647  #[inline]
648  #[must_use]
649  pub fn is_negative(self) -> Self {
650    self.cmp_lt(Self::zeroed())
651  }
652
653  /// horizontal add of all the elements of the vector
654  #[inline]
655  #[must_use]
656  pub fn reduce_add(self) -> i16 {
657    let arr: [i16; 8] = cast(self);
658
659    (arr[0].wrapping_add(arr[1]).wrapping_add(arr[2].wrapping_add(arr[3])))
660      .wrapping_add(
661        arr[4].wrapping_add(arr[5]).wrapping_add(arr[6].wrapping_add(arr[7])),
662      )
663  }
664
665  /// horizontal min of all the elements of the vector
666  #[inline]
667  #[must_use]
668  pub fn reduce_min(self) -> i16 {
669    let arr: [i16; 8] = cast(self);
670
671    (arr[0].min(arr[1]).min(arr[2].min(arr[3])))
672      .min(arr[4].min(arr[5]).min(arr[6].min(arr[7])))
673  }
674
675  /// horizontal max of all the elements of the vector
676  #[inline]
677  #[must_use]
678  pub fn reduce_max(self) -> i16 {
679    let arr: [i16; 8] = cast(self);
680
681    (arr[0].max(arr[1]).max(arr[2].max(arr[3])))
682      .max(arr[4].max(arr[5]).max(arr[6].max(arr[7])))
683  }
684
685  #[inline]
686  #[must_use]
687  pub fn abs(self) -> Self {
688    pick! {
689      if #[cfg(target_feature="sse2")] {
690        let mask = shr_imm_i16_m128i::<15>(self.sse);
691        Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
692      } else if #[cfg(target_feature="ssse3")] {
693        Self { sse: abs_i16_m128i(self.sse) }
694      } else if #[cfg(target_feature="simd128")] {
695        Self { simd: i16x8_abs(self.simd) }
696      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
697        unsafe {Self { neon: vabsq_s16(self.neon) }}
698      } else {
699        self.is_negative().blend(self.neg(), self)
700      }
701    }
702  }
703  #[inline]
704  #[must_use]
705  pub fn max(self, rhs: Self) -> Self {
706    pick! {
707      if #[cfg(target_feature="sse2")] {
708        Self { sse: max_i16_m128i(self.sse, rhs.sse) }
709      } else if #[cfg(target_feature="simd128")] {
710        Self { simd: i16x8_max(self.simd, rhs.simd) }
711      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
712        unsafe {Self { neon: vmaxq_s16(self.neon, rhs.neon) }}
713      } else {
714        self.cmp_lt(rhs).blend(rhs, self)
715      }
716    }
717  }
718  #[inline]
719  #[must_use]
720  pub fn min(self, rhs: Self) -> Self {
721    pick! {
722      if #[cfg(target_feature="sse2")] {
723        Self { sse: min_i16_m128i(self.sse, rhs.sse) }
724      } else if #[cfg(target_feature="simd128")] {
725        Self { simd: i16x8_min(self.simd, rhs.simd) }
726      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
727        unsafe {Self { neon: vminq_s16(self.neon, rhs.neon) }}
728      } else {
729        self.cmp_lt(rhs).blend(self, rhs)
730      }
731    }
732  }
733
734  #[inline]
735  #[must_use]
736  pub fn saturating_add(self, rhs: Self) -> Self {
737    pick! {
738      if #[cfg(target_feature="sse2")] {
739        Self { sse: add_saturating_i16_m128i(self.sse, rhs.sse) }
740      } else if #[cfg(target_feature="simd128")] {
741        Self { simd: i16x8_add_sat(self.simd, rhs.simd) }
742      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
743        unsafe {Self { neon: vqaddq_s16(self.neon, rhs.neon) }}
744      } else {
745        Self { arr: [
746          self.arr[0].saturating_add(rhs.arr[0]),
747          self.arr[1].saturating_add(rhs.arr[1]),
748          self.arr[2].saturating_add(rhs.arr[2]),
749          self.arr[3].saturating_add(rhs.arr[3]),
750          self.arr[4].saturating_add(rhs.arr[4]),
751          self.arr[5].saturating_add(rhs.arr[5]),
752          self.arr[6].saturating_add(rhs.arr[6]),
753          self.arr[7].saturating_add(rhs.arr[7]),
754        ]}
755      }
756    }
757  }
758  #[inline]
759  #[must_use]
760  pub fn saturating_sub(self, rhs: Self) -> Self {
761    pick! {
762      if #[cfg(target_feature="sse2")] {
763        Self { sse: sub_saturating_i16_m128i(self.sse, rhs.sse) }
764      } else if #[cfg(target_feature="simd128")] {
765        Self { simd: i16x8_sub_sat(self.simd, rhs.simd) }
766      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
767        unsafe { Self { neon: vqsubq_s16(self.neon, rhs.neon) } }
768      } else {
769        Self { arr: [
770          self.arr[0].saturating_sub(rhs.arr[0]),
771          self.arr[1].saturating_sub(rhs.arr[1]),
772          self.arr[2].saturating_sub(rhs.arr[2]),
773          self.arr[3].saturating_sub(rhs.arr[3]),
774          self.arr[4].saturating_sub(rhs.arr[4]),
775          self.arr[5].saturating_sub(rhs.arr[5]),
776          self.arr[6].saturating_sub(rhs.arr[6]),
777          self.arr[7].saturating_sub(rhs.arr[7]),
778        ]}
779      }
780    }
781  }
782
783  /// Calculates partial dot product.
784  /// Multiplies packed signed 16-bit integers, producing intermediate signed
785  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
786  /// integers.
787  #[inline]
788  #[must_use]
789  pub fn dot(self, rhs: Self) -> i32x4 {
790    pick! {
791      if #[cfg(target_feature="sse2")] {
792        i32x4 { sse:  mul_i16_horizontal_add_m128i(self.sse, rhs.sse) }
793      } else if #[cfg(target_feature="simd128")] {
794        i32x4 { simd: i32x4_dot_i16x8(self.simd, rhs.simd) }
795      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
796        unsafe {
797          let pl = vmull_s16(vget_low_s16(self.neon),  vget_low_s16(rhs.neon));
798          let ph = vmull_high_s16(self.neon, rhs.neon);
799          i32x4 { neon: vpaddq_s32(pl, ph) }
800        }
801      } else {
802        i32x4 { arr: [
803          (i32::from(self.arr[0]) * i32::from(rhs.arr[0])) + (i32::from(self.arr[1]) * i32::from(rhs.arr[1])),
804          (i32::from(self.arr[2]) * i32::from(rhs.arr[2])) + (i32::from(self.arr[3]) * i32::from(rhs.arr[3])),
805          (i32::from(self.arr[4]) * i32::from(rhs.arr[4])) + (i32::from(self.arr[5]) * i32::from(rhs.arr[5])),
806          (i32::from(self.arr[6]) * i32::from(rhs.arr[6])) + (i32::from(self.arr[7]) * i32::from(rhs.arr[7])),
807        ] }
808      }
809    }
810  }
811
  /// Multiply and scale, equivalent to `((self * rhs) + 0x4000) >> 15` on
  /// each lane, effectively multiplying by a 16-bit fixed-point number
  /// between -1 and 1. This corresponds to the following instructions:
  /// - vqrdmulhq_s16 instruction on neon
  /// - i16x8_q15mulr_sat on simd128
  /// - _mm_mulhrs_epi16 on ssse3
  /// - emulated via mul_i16_* on sse2
  ///
  /// NOTE(review): for the single input pair `i16::MIN * i16::MIN` the
  /// branches appear to disagree (the saturating variants give `i16::MAX`,
  /// the wrapping ones `i16::MIN`); confirm before relying on that case.
  #[inline]
  #[must_use]
  pub fn mul_scale_round(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse:  mul_i16_scale_round_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // unfortunately mul_i16_scale_round_m128i only got added in ssse3
        // Build the full 32-bit products by interleaving the low and high
        // 16-bit halves of each multiplication.
        let hi = mul_i16_keep_high_m128i(self.sse, rhs.sse);
        let lo = mul_i16_keep_low_m128i(self.sse, rhs.sse);
        let mut v1 = unpack_low_i16_m128i(lo, hi);
        let mut v2 = unpack_high_i16_m128i(lo, hi);
        // Add the rounding constant, then scale back down by 2^15.
        let a = set_splat_i32_m128i(0x4000);
        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
        // Narrow the eight 32-bit results back to 16-bit lanes.
        let s = pack_i32_to_i16_m128i(v1, v2);
        Self { sse: s }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_q15mulr_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqrdmulhq_s16(self.neon, rhs.neon) } }
      } else {
        // compiler does a surprisingly good job of vectorizing this
        Self { arr: [
          ((i32::from(self.arr[0]) * i32::from(rhs.arr[0]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[1]) * i32::from(rhs.arr[1]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[2]) * i32::from(rhs.arr[2]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[3]) * i32::from(rhs.arr[3]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[4]) * i32::from(rhs.arr[4]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[5]) * i32::from(rhs.arr[5]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[6]) * i32::from(rhs.arr[6]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[7]) * i32::from(rhs.arr[7]) + 0x4000) >> 15) as i16,
        ]}
      }
    }
  }
855
856  /// transpose matrix of 8x8 i16 matrix
857  #[must_use]
858  #[inline]
859  pub fn transpose(data: [i16x8; 8]) -> [i16x8; 8] {
860    pick! {
861      if #[cfg(target_feature="sse2")] {
862        let a1 = unpack_low_i16_m128i(data[0].sse, data[1].sse);
863        let a2 = unpack_high_i16_m128i(data[0].sse, data[1].sse);
864        let a3 = unpack_low_i16_m128i(data[2].sse, data[3].sse);
865        let a4 = unpack_high_i16_m128i(data[2].sse, data[3].sse);
866        let a5 = unpack_low_i16_m128i(data[4].sse, data[5].sse);
867        let a6 = unpack_high_i16_m128i(data[4].sse, data[5].sse);
868        let a7 = unpack_low_i16_m128i(data[6].sse, data[7].sse);
869        let a8 = unpack_high_i16_m128i(data[6].sse, data[7].sse);
870
871        let b1 = unpack_low_i32_m128i(a1, a3);
872        let b2 = unpack_high_i32_m128i(a1, a3);
873        let b3 = unpack_low_i32_m128i(a2, a4);
874        let b4 = unpack_high_i32_m128i(a2, a4);
875        let b5 = unpack_low_i32_m128i(a5, a7);
876        let b6 = unpack_high_i32_m128i(a5, a7);
877        let b7 = unpack_low_i32_m128i(a6, a8);
878        let b8 = unpack_high_i32_m128i(a6, a8);
879
880        [
881          i16x8 { sse: unpack_low_i64_m128i(b1, b5) },
882          i16x8 { sse: unpack_high_i64_m128i(b1, b5) },
883          i16x8 { sse: unpack_low_i64_m128i(b2, b6) },
884          i16x8 { sse: unpack_high_i64_m128i(b2, b6) },
885          i16x8 { sse: unpack_low_i64_m128i(b3, b7) },
886          i16x8 { sse: unpack_high_i64_m128i(b3, b7) },
887          i16x8 { sse: unpack_low_i64_m128i(b4, b8) },
888          i16x8 { sse: unpack_high_i64_m128i(b4, b8) } ,
889        ]
890     } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
891
892          #[inline] fn vtrq32(a : int16x8_t, b : int16x8_t) -> (int16x8_t, int16x8_t)
893          {
894              unsafe {
895                let r = vtrnq_s32(vreinterpretq_s32_s16(a),vreinterpretq_s32_s16(b));
896                (vreinterpretq_s16_s32(r.0), vreinterpretq_s16_s32(r.1))
897              }
898          }
899
900        unsafe {
901          let (q0,q2) = vtrq32(data[0].neon, data[2].neon);
902          let (q1,q3) = vtrq32(data[1].neon, data[3].neon);
903          let (q4,q6) = vtrq32(data[4].neon, data[6].neon);
904          let (q5,q7) = vtrq32(data[5].neon, data[7].neon);
905
906          let b1 = vtrnq_s16(q0, q1);
907          let b2 = vtrnq_s16(q2, q3);
908          let b3 = vtrnq_s16(q4, q5);
909          let b4 = vtrnq_s16(q6, q7);
910
911          // There is no vtrnq_s64 unfortunately, so there's this mess
912          // which does a somewhat reasonable job, but not as good as the
913          // assembly versions which just swap the 64 bit register aliases.
914          [
915            i16x8 { neon: vcombine_s16(vget_low_s16(b1.0), vget_low_s16(b3.0)) },
916            i16x8 { neon: vcombine_s16(vget_low_s16(b1.1), vget_low_s16(b3.1)) },
917            i16x8 { neon: vcombine_s16(vget_low_s16(b2.0), vget_low_s16(b4.0)) },
918            i16x8 { neon: vcombine_s16(vget_low_s16(b2.1), vget_low_s16(b4.1)) },
919            i16x8 { neon: vcombine_s16(vget_high_s16(b1.0), vget_high_s16(b3.0)) },
920            i16x8 { neon: vcombine_s16(vget_high_s16(b1.1), vget_high_s16(b3.1)) },
921            i16x8 { neon: vcombine_s16(vget_high_s16(b2.0), vget_high_s16(b4.0)) },
922            i16x8 { neon: vcombine_s16(vget_high_s16(b2.1), vget_high_s16(b4.1)) },
923          ]
924        }
925      } else if #[cfg(target_feature="simd128")] {
926        #[inline] fn lo_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a,b) }
927        #[inline] fn hi_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a,b) }
928        #[inline] fn lo_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a,b) }
929        #[inline] fn hi_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a,b) }
930        #[inline] fn lo_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<0, 2>(a,b) }
931        #[inline] fn hi_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<1, 3>(a,b) }
932
933        let a1 = lo_i16(data[0].simd, data[1].simd);
934        let a2 = hi_i16(data[0].simd, data[1].simd);
935        let a3 = lo_i16(data[2].simd, data[3].simd);
936        let a4 = hi_i16(data[2].simd, data[3].simd);
937        let a5 = lo_i16(data[4].simd, data[5].simd);
938        let a6 = hi_i16(data[4].simd, data[5].simd);
939        let a7 = lo_i16(data[6].simd, data[7].simd);
940        let a8 = hi_i16(data[6].simd, data[7].simd);
941
942        let b1 = lo_i32(a1, a3);
943        let b2 = hi_i32(a1, a3);
944        let b3 = lo_i32(a2, a4);
945        let b4 = hi_i32(a2, a4);
946        let b5 = lo_i32(a5, a7);
947        let b6 = hi_i32(a5, a7);
948        let b7 = lo_i32(a6, a8);
949        let b8 = hi_i32(a6, a8);
950
951        [
952          i16x8 { simd: lo_i64(b1, b5) },
953          i16x8 { simd: hi_i64(b1, b5) },
954          i16x8 { simd: lo_i64(b2, b6) },
955          i16x8 { simd: hi_i64(b2, b6) },
956          i16x8 { simd: lo_i64(b3, b7) },
957          i16x8 { simd: hi_i64(b3, b7) },
958          i16x8 { simd: lo_i64(b4, b8) },
959          i16x8 { simd: hi_i64(b4, b8) } ,
960        ]
961
962      } else {
963        #[inline(always)]
964        fn transpose_column(data: &[i16x8; 8], index: usize) -> i16x8 {
965          i16x8::new([
966            data[0].as_array_ref()[index],
967            data[1].as_array_ref()[index],
968            data[2].as_array_ref()[index],
969            data[3].as_array_ref()[index],
970            data[4].as_array_ref()[index],
971            data[5].as_array_ref()[index],
972            data[6].as_array_ref()[index],
973            data[7].as_array_ref()[index],
974          ])
975        }
976
977        [
978          transpose_column(&data, 0),
979          transpose_column(&data, 1),
980          transpose_column(&data, 2),
981          transpose_column(&data, 3),
982          transpose_column(&data, 4),
983          transpose_column(&data, 5),
984          transpose_column(&data, 6),
985          transpose_column(&data, 7),
986        ]
987      }
988    }
989  }
990
991  #[inline]
992  #[must_use]
993  /// Multiply and scale, equivalent to ((self * rhs) + 0x4000) >> 15 on each
994  /// lane, effectively multiplying by a 16 bit fixed point number between -1
995  /// and 1. This corresponds to the following instructions:
996  /// - vqrdmulhq_n_s16 instruction on neon
997  /// - i16x8_q15mulr_sat on simd128
998  /// - _mm_mulhrs_epi16 on ssse3
999  /// - emulated via mul_i16_* on sse2
1000  pub fn mul_scale_round_n(self, rhs: i16) -> Self {
1001    pick! {
1002      if #[cfg(target_feature="ssse3")] {
1003        Self { sse:  mul_i16_scale_round_m128i(self.sse, set_splat_i16_m128i(rhs)) }
1004      } else if #[cfg(target_feature="sse2")] {
1005        // unfortunately mul_i16_scale_round_m128i only got added in sse3
1006        let r = set_splat_i16_m128i(rhs);
1007        let hi = mul_i16_keep_high_m128i(self.sse, r);
1008        let lo = mul_i16_keep_low_m128i(self.sse, r);
1009        let mut v1 = unpack_low_i16_m128i(lo, hi);
1010        let mut v2 = unpack_high_i16_m128i(lo, hi);
1011        let a = set_splat_i32_m128i(0x4000);
1012        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
1013        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
1014        let s = pack_i32_to_i16_m128i(v1, v2);
1015        Self { sse: s }
1016      } else if #[cfg(target_feature="simd128")] {
1017        Self { simd: i16x8_q15mulr_sat(self.simd, i16x8_splat(rhs)) }
1018      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1019        unsafe { Self { neon: vqrdmulhq_n_s16(self.neon, rhs) } }
1020      } else {
1021        // compiler does a surprisingly good job of vectorizing this
1022        Self { arr: [
1023          ((i32::from(self.arr[0]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1024          ((i32::from(self.arr[1]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1025          ((i32::from(self.arr[2]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1026          ((i32::from(self.arr[3]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1027          ((i32::from(self.arr[4]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1028          ((i32::from(self.arr[5]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1029          ((i32::from(self.arr[6]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1030          ((i32::from(self.arr[7]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1031        ]}
1032      }
1033    }
1034  }
1035
1036  #[inline]
1037  pub fn to_array(self) -> [i16; 8] {
1038    cast(self)
1039  }
1040
1041  #[inline]
1042  pub fn as_array_ref(&self) -> &[i16; 8] {
1043    cast_ref(self)
1044  }
1045
1046  #[inline]
1047  pub fn as_array_mut(&mut self) -> &mut [i16; 8] {
1048    cast_mut(self)
1049  }
1050}