wide/i32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) avx2: m256i }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) a : i32x4, pub(crate) b : i32x4 }
  }
}

int_uint_consts!(i32, 8, i32x8, i32x8, i32a8, const_i32_as_i32x8, 256);

unsafe impl Zeroable for i32x8 {}
unsafe impl Pod for i32x8 {}
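
// An illustrative test sketch (an addition, not part of the upstream suite):
// `Zeroable` + `Pod` make the vector freely castable to and from a plain
// `[i32; 8]` via the `cast` helpers already imported by `use super::*;`.
#[test]
fn i32x8_pod_cast_example() {
  let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  let arr: [i32; 8] = cast(v);
  assert_eq!(arr, [1, 2, 3, 4, 5, 6, 7, 8]);
  assert_eq!(i32x8::from(arr).to_array(), arr);
}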

impl Add for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: add_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: sub_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i32_keep_low_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}
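
// An illustrative test sketch (an addition, not part of the upstream suite):
// the ops are element-wise, and `Mul` keeps only the low 32 bits of each
// product, so it wraps on overflow like `i32::wrapping_mul`.
#[test]
fn i32x8_elementwise_ops_example() {
  let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  let b = i32x8::new([8, 7, 6, 5, 4, 3, 2, 1]);
  assert_eq!((a + b).to_array(), [9; 8]);
  assert_eq!((a - b).to_array(), [-7, -5, -3, -1, 1, 3, 5, 7]);
  assert_eq!((a * b).to_array(), [8, 14, 18, 20, 20, 18, 14, 8]);
}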

impl Add<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: i32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i32> for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) + rhs
  }
}

impl Sub<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) - rhs
  }
}

impl Mul<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) * rhs
  }
}
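
// An illustrative test sketch (an addition, not part of the upstream suite):
// the scalar forms splat the `i32` into every lane and defer to the vector
// op, so both operand orders agree.
#[test]
fn i32x8_scalar_ops_example() {
  let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  assert_eq!((v + 1).to_array(), [2, 3, 4, 5, 6, 7, 8, 9]);
  assert_eq!((v * 2).to_array(), (2 * v).to_array());
}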

impl BitAnd for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitand_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}
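
// An illustrative test sketch (an addition, not part of the upstream suite):
// the bitwise ops act on all 256 bits at once, which is identical to applying
// them lane by lane.
#[test]
fn i32x8_bit_ops_example() {
  let a = i32x8::splat(0b1100);
  let b = i32x8::splat(0b1010);
  assert_eq!((a & b).to_array(), [0b1000; 8]);
  assert_eq!((a | b).to_array(), [0b1110; 8]);
  assert_eq!((a ^ b).to_array(), [0b0110; 8]);
}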

macro_rules! impl_shl_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shl_all_u32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shl(rhs),
              b : self.b.shl(rhs),
            }
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shr_all_i32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shr(rhs),
              b : self.b.shr(rhs),
            }
          }
        }
      }
    })+
  };
}

impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
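
// An illustrative test sketch (an addition, not part of the upstream suite):
// every lane shifts by the same scalar amount, and `>>` is an arithmetic
// shift for this signed type, so the sign bit is preserved.
#[test]
fn i32x8_shift_example() {
  let v = i32x8::new([1, -1, 2, -2, 4, -4, 8, -8]);
  assert_eq!((v << 1).to_array(), [2, -2, 4, -4, 8, -8, 16, -16]);
  assert_eq!((v >> 1).to_array(), [0, -1, 1, -1, 2, -2, 4, -4]);
}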

impl CmpEq for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for i32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // lt == !gt & !eq, and since the `gt` and `eq` masks are never both
        // set for the same lane, `!gt ^ eq` computes the same result without
        // needing a dedicated "less than" compare.
        Self { avx2: !cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}
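
// An illustrative test sketch (an addition, not part of the upstream suite):
// comparisons return per-lane masks, all ones (-1) for true and all zeros for
// false, which is exactly the form `blend` expects.
#[test]
fn i32x8_cmp_mask_example() {
  let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
  let b = i32x8::splat(4);
  assert_eq!(a.cmp_lt(b).to_array(), [-1, -1, -1, 0, 0, 0, 0, 0]);
  assert_eq!(a.cmp_eq(b).to_array(), [0, 0, 0, -1, 0, 0, 0, 0]);
}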

impl i32x8 {
  #[inline]
  #[must_use]
  pub fn new(array: [i32; 8]) -> Self {
    Self::from(array)
  }

  /// widens and sign extends to i32x8
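  ///
  /// A minimal doc-test sketch added for illustration (assumes the crate's
  /// usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let v = i16x8::from([1, -1, 2, -2, 3, -3, i16::MAX, i16::MIN]);
  /// assert_eq!(i32x8::from_i16x8(v).to_array(),
  ///   [1, -1, 2, -2, 3, -3, 32767, -32768]);
  /// ```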
  #[inline]
  #[must_use]
  pub fn from_i16x8(v: i16x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: convert_to_i32_m256i_from_i16_m128i(v.sse) }
      } else if #[cfg(target_feature="sse2")] {
        i32x8 {
          a: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_low_i16_m128i(v.sse, v.sse)) },
          b: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_high_i16_m128i(v.sse, v.sse)) },
        }
      } else {
        i32x8::new([
          v.as_array_ref()[0] as i32,
          v.as_array_ref()[1] as i32,
          v.as_array_ref()[2] as i32,
          v.as_array_ref()[3] as i32,
          v.as_array_ref()[4] as i32,
          v.as_array_ref()[5] as i32,
          v.as_array_ref()[6] as i32,
          v.as_array_ref()[7] as i32,
        ])
      }
    }
  }

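  /// Selects lanes from `t` where the corresponding lane of `self` is all
  /// ones, otherwise from `f`. A minimal doc-test sketch added for
  /// illustration (assumes the crate's usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let mask = i32x8::from([-1, 0, -1, 0, -1, 0, -1, 0]);
  /// let t = i32x8::splat(1);
  /// let f = i32x8::splat(2);
  /// assert_eq!(mask.blend(t, f).to_array(), [1, 2, 1, 2, 1, 2, 1, 2]);
  /// ```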
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b)
        }
      }
    }
  }

  /// horizontal add of all the elements of the vector
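  ///
  /// A minimal doc-test sketch added for illustration (assumes the crate's
  /// usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let v = i32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
  /// assert_eq!(v.reduce_add(), 36);
  /// ```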
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    (arr[0] + arr[1]).reduce_add()
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].max(arr[1]).reduce_max()
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].min(arr[1]).reduce_min()
  }

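  /// Lane-wise absolute value. A minimal doc-test sketch added for
  /// illustration (assumes the crate's usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let v = i32x8::from([-3, 3, -2, 2, -1, 1, 0, -7]);
  /// assert_eq!(v.abs().to_array(), [3, 3, 2, 2, 1, 1, 0, 7]);
  /// ```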
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: abs_i32_m256i(self.avx2) }
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: max_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: min_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
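  /// Converts each lane to `f32`. A minimal doc-test sketch added for
  /// illustration (assumes the crate's usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let v = i32x8::from([1, 2, 3, -4, 5, -6, 7, 8]);
  /// assert_eq!(v.round_float().to_array(), [1.0, 2.0, 3.0, -4.0, 5.0, -6.0, 7.0, 8.0]);
  /// ```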
  #[inline]
  #[must_use]
  pub fn round_float(self) -> f32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        cast(convert_to_m256_from_i32_m256i(self.avx2))
      } else {
        cast([
          self.a.round_float(),
          self.b.round_float(),
        ])
      }
    }
  }

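  /// Packs the sign bit of each lane into the low 8 bits of the result, lane
  /// 0 in bit 0. A minimal doc-test sketch added for illustration (assumes
  /// the crate's usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// let v = i32x8::from([-1, 0, -1, 0, 0, 0, 0, -1]);
  /// assert_eq!(v.move_mask(), 0b10000101);
  /// ```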
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        move_mask_m256(cast(self.avx2)) as i32
      } else {
        self.a.move_mask() | (self.b.move_mask() << 4)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) != 0
      } else {
        (self.a | self.b).any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) == 0b10001000100010001000100010001000
      } else {
        (self.a & self.b).all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Transposes an 8x8 matrix of `i32` lanes. Currently only accelerated on
  /// AVX2.
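  ///
  /// A minimal doc-test sketch added for illustration (assumes the crate's
  /// usual `use wide::*;` re-exports):
  /// ```
  /// use wide::*;
  /// // row r holds r in every lane, so each transposed row is [0, 1, ..., 7].
  /// let m = [
  ///   i32x8::splat(0), i32x8::splat(1), i32x8::splat(2), i32x8::splat(3),
  ///   i32x8::splat(4), i32x8::splat(5), i32x8::splat(6), i32x8::splat(7),
  /// ];
  /// for row in i32x8::transpose(m).iter() {
  ///   assert_eq!(row.to_array(), [0, 1, 2, 3, 4, 5, 6, 7]);
  /// }
  /// ```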
  #[must_use]
  #[inline]
  pub fn transpose(data: [i32x8; 8]) -> [i32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a0 = unpack_low_i32_m256i(data[0].avx2, data[1].avx2);
        let a1 = unpack_high_i32_m256i(data[0].avx2, data[1].avx2);
        let a2 = unpack_low_i32_m256i(data[2].avx2, data[3].avx2);
        let a3 = unpack_high_i32_m256i(data[2].avx2, data[3].avx2);
        let a4 = unpack_low_i32_m256i(data[4].avx2, data[5].avx2);
        let a5 = unpack_high_i32_m256i(data[4].avx2, data[5].avx2);
        let a6 = unpack_low_i32_m256i(data[6].avx2, data[7].avx2);
        let a7 = unpack_high_i32_m256i(data[6].avx2, data[7].avx2);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: the Intel performance manual suggests an alternative
        // using blend to avoid port 5 pressure (since blend runs on a
        // different port than shuffle).
        let b0 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a0),cast(a2)));
        let b1 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a0),cast(a2)));
        let b2 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a1),cast(a3)));
        let b3 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a1),cast(a3)));
        let b4 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a4),cast(a6)));
        let b5 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a4),cast(a6)));
        let b6 = cast::<m256,m256i>(shuffle_m256::<SHUFF_LO>(cast(a5),cast(a7)));
        let b7 = cast::<m256,m256i>(shuffle_m256::<SHUFF_HI>(cast(a5),cast(a7)));

        [
          i32x8 { avx2: permute2z_m256i::<0x20>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b3, b7) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[i32x8; 8], index: usize) -> i32x8 {
          i32x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[i32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [i32; 8] {
    cast_mut(self)
  }
}

impl Not for i32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: self.avx2.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}
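
// An illustrative test sketch (an addition, not part of the upstream suite):
// `!` flips every bit, so each lane obeys `!x == -x - 1`.
#[test]
fn i32x8_not_example() {
  let v = i32x8::new([0, -1, 1, -2, 2, 5, -6, 7]);
  assert_eq!((!v).to_array(), [-1, 0, -2, 1, -3, -6, 5, -8]);
}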