// core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    pause()
}
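
// Illustrative sketch, not part of the upstream source: the kind of spin-wait
// loop `_mm_pause` is meant for. `locked` is a hypothetical flag; portable
// code would normally use `core::hint::spin_loop`, which lowers to `pause`
// on x86.
#[cfg(all(test, target_feature = "sse2"))]
#[allow(dead_code)]
fn spin_until_unlocked(locked: &core::sync::atomic::AtomicBool) {
    use core::sync::atomic::Ordering;
    while locked.load(Ordering::Acquire) {
        // Tell the CPU we are busy-waiting; this reduces power draw and
        // frees execution resources for the sibling hyper-thread.
        unsafe { _mm_pause() }
    }
}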

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order,
/// the load fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
    lfence()
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
    mfence()
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}
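
// Illustrative sketch, not part of the upstream source: contrasts the
// wrapping `_mm_add_epi8` with the saturating `_mm_adds_epi8` at the i8
// boundary, assuming SSE2 is statically enabled for the test build.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn adds_saturates_where_add_wraps() {
    let a = _mm_set1_epi8(i8::MAX); // 127 in every lane
    let b = _mm_set1_epi8(1);
    // The wrapping add overflows to -128 ...
    let wrapped = _mm_add_epi8(a, b);
    let all_min = _mm_set1_epi8(i8::MIN);
    assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(wrapped, all_min)), 0xFFFF);
    // ... while the saturating add clamps at 127.
    let clamped = _mm_adds_epi8(a, b);
    let all_max = _mm_set1_epi8(i8::MAX);
    assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(clamped, all_max)), 0xFFFF);
}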

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}
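
// Illustrative sketch, not part of the upstream source: the averages above
// compute `(a + b + 1) >> 1` in a widened type, so exact halves round up.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn avg_rounds_half_up() {
    let a = _mm_set1_epi16(3);
    let b = _mm_set1_epi16(4);
    // (3 + 4 + 1) >> 1 == 4 in every lane.
    let r = _mm_avg_epu16(a, b);
    assert_eq!(_mm_extract_epi16::<0>(r), 4);
}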

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}
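
// Illustrative sketch, not part of the upstream source: `_mm_madd_epi16` is
// one step of a 16-bit dot product; lanes (0,1), (2,3), ... are multiplied
// and summed into four i32 results.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn madd_multiplies_then_adds_pairs() {
    let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm_set1_epi16(10);
    // First i32 lane: 1*10 + 2*10 == 30.
    let r = _mm_madd_epi16(a, b);
    assert_eq!(_mm_cvtsi128_si32(r), 30);
}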

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}
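
// Illustrative sketch, not part of the upstream source: together,
// `_mm_mulhi_epi16` and `_mm_mullo_epi16` recover the full 32-bit product
// of two i16 lanes.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn mulhi_mullo_split_the_product() {
    let a = _mm_set1_epi16(1000);
    let b = _mm_set1_epi16(1000);
    // 1000 * 1000 == 1_000_000 == 0x000F_4240
    let hi = _mm_extract_epi16::<0>(_mm_mulhi_epi16(a, b)) as u32; // 0x000F
    let lo = _mm_extract_epi16::<0>(_mm_mullo_epi16(a, b)) as u32; // 0x4240
    assert_eq!((hi << 16) | lo, 1_000_000);
}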

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX.into());
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
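
// Illustrative sketch, not part of the upstream source: only the low 32 bits
// of each 64-bit lane participate; the high halves are masked away before
// the widening multiply.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn mul_epu32_uses_low_halves() {
    // Lane 0: low halves are 5 and 7; the `1 << 32` parts are ignored.
    let a = _mm_set_epi64x(0, (1i64 << 32) | 5);
    let b = _mm_set_epi64x(0, (1i64 << 32) | 7);
    let r = _mm_mul_epu32(a, b);
    assert_eq!(_mm_cvtsi128_si32(r), 35);
}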

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive 8 differences to produce
/// two unsigned 16-bit integers, and packs these unsigned 16-bit integers in
/// the low 16 bits of the returned 64-bit elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}
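
// Illustrative sketch, not part of the upstream source: each 64-bit half of
// the result holds one sum of eight absolute byte differences.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn sad_sums_absolute_differences() {
    let a = _mm_set1_epi8(5);
    let b = _mm_set1_epi8(2);
    // |5 - 2| summed over 8 bytes == 24 in the low 16 bits of each half.
    let r = _mm_sad_epu8(a, b);
    assert_eq!(_mm_cvtsi128_si32(r), 24);
}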

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}
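
// Illustrative sketch, not part of the upstream source: a byte shift by one
// moves every element up one lane and feeds a zero into lane 0.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn slli_si128_shifts_whole_bytes() {
    let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    let r = _mm_slli_si128::<1>(a);
    // Lanes are now [0, 1, 2, ..., 15]; read back as a little-endian i32.
    assert_eq!(_mm_cvtsi128_si32(r), 0x03020100);
}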

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}
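
// Illustrative sketch, not part of the upstream source: the shift amount is
// clamped to 15, so oversized immediates fill every lane with its sign bit.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn srai_clamps_the_shift_count() {
    let a = _mm_set1_epi16(-8);
    // Arithmetic shift keeps the sign: -8 >> 2 == -2.
    assert_eq!(_mm_extract_epi16::<0>(_mm_srai_epi16::<2>(a)) as i16, -2);
    // Any count >= 16 behaves like 15: all ones for a negative input.
    assert_eq!(_mm_extract_epi16::<0>(_mm_srai_epi16::<200>(a)) as i16, -1);
}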

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}
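
// Illustrative sketch, not part of the upstream source: `_mm_andnot_si128(a, b)`
// computes `!a & b`, which clears exactly the bits of `b` that are set in the
// mask `a`.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn andnot_clears_masked_bits() {
    let mask = _mm_set1_epi32(0x0000_00FF);
    let value = _mm_set1_epi32(0x1234_5678);
    let r = _mm_andnot_si128(mask, value);
    assert_eq!(_mm_cvtsi128_si32(r), 0x1234_5600);
}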

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}
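
// Illustrative sketch, not part of the upstream source: `_mm_set_epi32`
// takes its arguments from the highest element down to the lowest, so `e0`
// ends up in the lane read back by `_mm_cvtsi128_si32`.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn set_epi32_argument_order() {
    let v = _mm_set_epi32(3, 2, 1, 0);
    assert_eq!(_mm_cvtsi128_si32(v), 0);
    // `_mm_setr_epi32` is the same constructor with reversed argument order.
    let v = _mm_setr_epi32(3, 2, 1, 0);
    assert_eq!(_mm_cvtsi128_si32(v), 3);
}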

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}
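
// Illustrative sketch, not part of the upstream source: the unaligned
// load/store pair moves 16 bytes through arbitrarily aligned memory.
#[cfg(all(test, target_feature = "sse2"))]
#[test]
fn loadu_storeu_roundtrip() {
    let src = [42i8; 16];
    let mut dst = [0i8; 16];
    unsafe {
        let v = _mm_loadu_si128(src.as_ptr() as *const __m128i);
        _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, v);
    }
    assert_eq!(src, dst);
}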
/// Conditionally stores 8-bit integer elements from `a` into memory using
/// `mask`.
///
/// An element of `a` is stored only when the highest bit of the corresponding
/// `mask` element is set.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}

/// Stores the lower 64-bit integer of `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
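
// Illustrative sketch, not part of the upstream source: pairing the
// non-temporal store with the `_mm_sfence` the safety section above requires.
// Assumes it is fine to overwrite all of `dst`.
#[cfg(all(test, target_feature = "sse2"))]
#[allow(dead_code)]
fn stream_zeroes_then_fence(dst: &mut [__m128i]) {
    unsafe {
        for slot in dst.iter_mut() {
            // `__m128i` is 16-byte aligned, so `slot` is a valid target.
            _mm_stream_si128(slot, _mm_setzero_si128());
        }
        // Make the non-temporal stores globally visible before `dst` is
        // accessed again.
        _mm_sfence();
    }
}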
1353
1354/// Stores a 32-bit integer value in the specified memory location.
1355/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1356/// used again soon).
1357///
1358/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1359///
1360/// # Safety of non-temporal stores
1361///
1362/// After using this intrinsic, but before any other access to the memory that this intrinsic
1363/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1364/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1365/// return.
1366///
1367/// See [`_mm_sfence`] for details.
1368#[inline]
1369#[target_feature(enable = "sse2")]
1370#[cfg_attr(test, assert_instr(movnti))]
1371#[stable(feature = "simd_x86", since = "1.27.0")]
1372pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1373    crate::arch::asm!(
1374        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1375        p = in(reg) mem_addr,
1376        a = in(reg) a,
1377        options(nostack, preserves_flags),
1378    );
1379}
1380
1381/// Returns a vector where the low element is extracted from `a` and its upper
1382/// element is zero.
1383///
1384/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1385#[inline]
1386#[target_feature(enable = "sse2")]
1387// FIXME movd on msvc, movd on i686
1388#[cfg_attr(
1389    all(test, not(target_env = "msvc"), target_arch = "x86_64"),
1390    assert_instr(movq)
1391)]
1392#[stable(feature = "simd_x86", since = "1.27.0")]
1393pub fn _mm_move_epi64(a: __m128i) -> __m128i {
1394    unsafe {
1395        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1396        transmute(r)
1397    }
1398}
1399
1400/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1401/// using signed saturation.
1402///
1403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
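///
/// # Examples
///
/// A small sketch of the unsigned clamping (illustrative only; assumes an
/// `x86_64` target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         unsafe {
///             let a = _mm_setr_epi16(-1, 0, 128, 300, 0, 0, 0, 0);
///             let r = _mm_packus_epi16(a, _mm_setzero_si128());
///             let mut out = [0u8; 16];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             // Negative inputs clamp to 0; inputs above 255 clamp to 255.
///             assert_eq!(&out[..4], [0, 0, 128, 255]);
///         }
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```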
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Returns the `IMM8` element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
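///
/// # Examples
///
/// A small sketch of lane extraction (illustrative only; assumes an `x86_64`
/// target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         let a = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
///         // Lanes are numbered in `_mm_setr_*` order, so lane 3 holds 40.
///         assert_eq!(_mm_extract_epi16::<3>(a), 40);
///         // The lane is zero-extended, so a negative lane comes back as its
///         // unsigned 16-bit representation.
///         let b = _mm_setr_epi16(-1, 0, 0, 0, 0, 0, 0, 0);
///         assert_eq!(_mm_extract_epi16::<0>(b), 0xFFFF);
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```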
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
}

/// Returns a new vector where the `IMM8` element of `a` is replaced with `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
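///
/// # Examples
///
/// A small sketch of lane replacement (illustrative only; assumes an `x86_64`
/// target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         let a = _mm_set1_epi16(0);
///         let r = _mm_insert_epi16::<2>(a, 99);
///         // Only lane 2 changes; every other lane is copied from `a`.
///         assert_eq!(_mm_extract_epi16::<2>(r), 99);
///         assert_eq!(_mm_extract_epi16::<0>(r), 0);
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```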
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
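///
/// # Examples
///
/// A small sketch of the bitmask layout (illustrative only; assumes an
/// `x86_64` target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         // Bit i of the result is the sign bit (MSB) of byte lane i.
///         let a = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128);
///         assert_eq!(_mm_movemask_epi8(a), (1 << 0) | (1 << 15));
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```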
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
    unsafe {
        let z = i8x16::ZERO;
        let m: i8x16 = simd_lt(a.as_i8x16(), z);
        simd_bitmask::<_, u16>(m) as u32 as i32
    }
}

/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
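///
/// # Examples
///
/// A small sketch of the control-byte encoding (illustrative only; assumes an
/// `x86_64` target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         unsafe {
///             // The control byte holds four 2-bit source-lane indices,
///             // lowest destination lane first: 0b00_01_10_11 picks lanes
///             // 3, 2, 1, 0, i.e. a full reversal.
///             let a = _mm_setr_epi32(0, 1, 2, 3);
///             let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
///             let mut out = [0i32; 4];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [3, 2, 1, 0]);
///         }
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```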
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x4();
        let x: i32x4 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the high 64 bits of the returned vector, with the low
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
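///
/// # Examples
///
/// A small sketch (illustrative only; assumes an `x86_64` target and `std`
/// for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         unsafe {
///             let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
///             // 0b00_01_10_11 reverses the high half; the low half passes
///             // through untouched.
///             let r = _mm_shufflehi_epi16::<0b00_01_10_11>(a);
///             let mut out = [0i16; 8];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [0, 1, 2, 3, 7, 6, 5, 4]);
///         }
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```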
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                (IMM8 as u32 & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the low 64 bits of the returned vector, with the high
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                4,
                5,
                6,
                7,
            ],
        );
        transmute(x)
    }
}

/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
}

/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
}

/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
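///
/// # Examples
///
/// A small sketch of the interleaving (illustrative only; assumes an `x86_64`
/// target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         unsafe {
///             let a = _mm_setr_epi16(0, 1, 2, 3, -1, -1, -1, -1);
///             let b = _mm_setr_epi16(10, 11, 12, 13, -1, -1, -1, -1);
///             let r = _mm_unpacklo_epi16(a, b);
///             let mut out = [0i16; 8];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             // The low halves of `a` and `b`, interleaved pairwise.
///             assert_eq!(out, [0, 10, 1, 11, 2, 12, 3, 13]);
///         }
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```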
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
}

/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
}

/// Returns a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_add(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
}

/// Divides packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_div(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxsd(a, b) }
}

/// Returns a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minsd(a, b) }
}

/// Returns a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the product of
/// the low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_mul(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
}

/// Returns a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    unsafe { simd_fsqrt(a) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// subtracting the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_sub(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_and_si128(a, b))
    }
}

/// Computes the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_andnot_si128(a, b))
    }
}

/// Computes the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_or_si128(a, b))
    }
}

/// Computes the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_xor_si128(a, b))
    }
}

/// Returns a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 0) }
}

/// Returns a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 1) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 2) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither is `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 7) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 3) }
}

/// Returns a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 4) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 5) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 6) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 0) }
}

/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
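///
/// # Examples
///
/// A small sketch of the per-lane mask semantics (illustrative only; assumes
/// an `x86_64` target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         // Each lane becomes all-ones where the comparison holds and
///         // all-zeros where it does not.
///         let a = _mm_setr_pd(1.0, 4.0);
///         let b = _mm_setr_pd(2.0, 3.0);
///         let m = _mm_cmplt_pd(a, b);
///         // Lane 0: 1.0 < 2.0 holds; lane 1: 4.0 < 3.0 does not.
///         assert_eq!(_mm_movemask_pd(m), 0b01);
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```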
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 1) }
}

/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 2) }
}

/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmplt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmple_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 7) }
}

/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 3) }
}

/// Compares corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 4) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 5) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 6) }
}

/// Compares corresponding elements in `a` and `b` for not-greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnlt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for
/// not-greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnle_pd(b, a)
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comineqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomineqsd(a, b) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    unsafe {
        let r = simd_cast::<_, f32x2>(a.as_f64x2());
        let zero = f32x2::ZERO;
        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
    }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
    unsafe {
        let a = a.as_f32x4();
        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvtpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    unsafe { cvtsd2si(a) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper elements from
/// `a` to the upper elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    unsafe { cvtsd2ss(a, b) }
}

/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper element from
/// `a` to the upper element of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    unsafe { cvtss2sd(a, b) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvttpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
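///
/// # Examples
///
/// A small sketch contrasting truncation with the rounding conversion
/// (illustrative only; assumes an `x86_64` target and `std` for runtime
/// feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         // Truncation rounds toward zero, regardless of sign.
///         assert_eq!(_mm_cvttsd_si32(_mm_setr_pd(3.7, 0.0)), 3);
///         assert_eq!(_mm_cvttsd_si32(_mm_setr_pd(-3.7, 0.0)), -3);
///         // `_mm_cvtsd_si32` uses the current rounding mode instead
///         // (round-to-nearest-even by default).
///         assert_eq!(_mm_cvtsd_si32(_mm_setr_pd(3.7, 0.0)), 4);
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```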
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    unsafe { cvttsd2si(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvttps2dq(a)) }
}

/// Copies the double-precision (64-bit) floating-point value `a` to the lower
/// element of the return value, and zeroes the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_sd(a: f64) -> __m128d {
    _mm_set_pd(0.0, a)
}

/// Broadcasts the double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcasts the double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_pd1(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
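///
/// # Examples
///
/// A small sketch of the argument order (illustrative only; assumes an
/// `x86_64` target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         unsafe {
///             // The first argument lands in the *high* element.
///             let v = _mm_set_pd(2.0, 1.0);
///             assert_eq!(_mm_cvtsd_f64(v), 1.0); // low element
///             // `_mm_setr_pd` takes elements in memory (low-to-high) order.
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), _mm_setr_pd(1.0, 2.0));
///             assert_eq!(out, [1.0, 2.0]);
///         }
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```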
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    __m128d([b, a])
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    _mm_set_pd(b, a)
}

/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_pd() -> __m128d {
    const { unsafe { mem::zeroed() } }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
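///
/// # Examples
///
/// A small sketch of the bit layout (illustrative only; assumes an `x86_64`
/// target and `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///
///     #[target_feature(enable = "sse2")]
///     unsafe fn demo() {
///         // Bit 0 is the sign bit of the low lane, bit 1 of the high lane.
///         assert_eq!(_mm_movemask_pd(_mm_setr_pd(-1.0, 1.0)), 0b01);
///         // Negative zero still has its sign bit set.
///         assert_eq!(_mm_movemask_pd(_mm_setr_pd(0.0, -0.0)), 0b10);
///     }
///
///     if is_x86_feature_detected!("sse2") {
///         unsafe { demo() }
///     }
/// }
/// ```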
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_pd(a: __m128d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
        simd_bitmask::<i64x2, u8>(mask).into()
    }
}

/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    *(mem_addr as *const __m128d)
}

/// Loads a 64-bit double-precision value to the low element of a 128-bit
/// vector of `[2 x double]` and clears the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, 0.)
}

/// Loads a double-precision value into the high-order bits of a 128-bit
/// vector of `[2 x double]`. The low-order bits are copied from the low-order
/// bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
}

/// Loads a double-precision value into the low-order bits of a 128-bit
/// vector of `[2 x double]`. The high-order bits are copied from the
/// high-order bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
}

/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    crate::arch::asm!(
        vps!("movntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract!(a, 0)
}

/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
    *(mem_addr as *mut __m128d) = a;
}

/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
    mem_addr.cast::<__m128d>().write_unaligned(a);
}

/// Stores a 16-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2659#[inline]
2660#[target_feature(enable = "sse2")]
2661#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2662pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2663    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2664}
2665
2666/// Store 32-bit integer from the first element of a into memory.
2667///
2668/// `mem_addr` does not need to be aligned on any particular boundary.
2669///
2670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2671#[inline]
2672#[target_feature(enable = "sse2")]
2673#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2674pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2675    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2676}
2677
2678/// Store 64-bit integer from the first element of a into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
}

/// Stores the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
/// memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract!(a, 1);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract!(a, 0);
}

/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
    let d = *mem_addr;
    _mm_setr_pd(d, d)
}

/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
    _mm_load1_pd(mem_addr)
}

/// Loads 2 double-precision (64-bit) floating-point elements from memory into
/// the returned vector in reverse order. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
    let a = _mm_load_pd(mem_addr);
    simd_shuffle!(a, a, [1, 0])
}

/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
    let mut dst = _mm_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128d>(),
    );
    dst
}

/// Loads unaligned 16 bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
    transmute(i16x8::new(
        ptr::read_unaligned(mem_addr as *const i16),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ))
}

/// Loads unaligned 32 bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
    transmute(i32x4::new(
        ptr::read_unaligned(mem_addr as *const i32),
        0,
        0,
        0,
    ))
}

/// Loads unaligned 64 bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
}

/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
/// parameter `MASK` as a specifier: bit 0 picks which lane of `a` becomes the
/// low result lane, and bit 1 picks which lane of `b` becomes the high result
/// lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
}

/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
}

/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// floating-point vector of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
    unsafe { transmute(a) }
}

/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
    unsafe { transmute(a) }
}

/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// floating-point vector of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castps_pd(a: __m128) -> __m128d {
    unsafe { transmute(a) }
}

/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castps_si128(a: __m128) -> __m128i {
    unsafe { transmute(a) }
}

/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
    unsafe { transmute(a) }
}

/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
    unsafe { transmute(a) }
}

/// Returns a vector of type `__m128d` with indeterminate elements.
/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
/// In practice, this is equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_undefined_pd() -> __m128d {
    const { unsafe { mem::zeroed() } }
}

/// Returns a vector of type `__m128i` with indeterminate elements.
/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
/// In practice, this is equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_undefined_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// The resulting `__m128d` element is composed of the high-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, b, [1, 3]) }
}

/// The resulting `__m128d` element is composed of the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, b, [0, 2]) }
}

#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse2.pause"]
    fn pause();
    #[link_name = "llvm.x86.sse2.clflush"]
    fn clflush(p: *const u8);
    #[link_name = "llvm.x86.sse2.lfence"]
    fn lfence();
    #[link_name = "llvm.x86.sse2.mfence"]
    fn mfence();
    #[link_name = "llvm.x86.sse2.pmadd.wd"]
    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
    #[link_name = "llvm.x86.sse2.psad.bw"]
    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
    #[link_name = "llvm.x86.sse2.psll.w"]
    fn psllw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psll.d"]
    fn pslld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psll.q"]
    fn psllq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.psra.w"]
    fn psraw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psra.d"]
    fn psrad(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.w"]
    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrl.d"]
    fn psrld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.q"]
    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.cvtps2dq"]
    fn cvtps2dq(a: __m128) -> i32x4;
    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
    #[link_name = "llvm.x86.sse2.packsswb.128"]
    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
    #[link_name = "llvm.x86.sse2.packssdw.128"]
    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
    #[link_name = "llvm.x86.sse2.packuswb.128"]
    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
    #[link_name = "llvm.x86.sse2.max.sd"]
    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.max.pd"]
    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.sd"]
    fn minsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.pd"]
    fn minpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.comieq.sd"]
    fn comieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comilt.sd"]
    fn comiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comile.sd"]
    fn comilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comigt.sd"]
    fn comigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comige.sd"]
    fn comigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comineq.sd"]
    fn comineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomile.sd"]
    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomige.sd"]
    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
    fn cvtpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvtsd2si"]
    fn cvtsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
    #[link_name = "llvm.x86.sse2.cvtss2sd"]
    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
    fn cvttpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvttsd2si"]
    fn cvttsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvttps2dq"]
    fn cvttps2dq(a: __m128) -> i32x4;
}

#[cfg(test)]
mod tests {
    use crate::{
        core_arch::{simd::*, x86::*},
        hint::black_box,
    };
    use std::{
        boxed, f32, f64,
        mem::{self, transmute},
        ptr,
    };
    use stdarch_test::simd_test;

    const NAN: f64 = f64::NAN;

    #[test]
    fn test_mm_pause() {
        unsafe { _mm_pause() }
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_clflush() {
        let x = 0_u8;
        _mm_clflush(ptr::addr_of!(x));
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_lfence() {
        _mm_lfence();
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_mfence() {
        _mm_mfence();
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi8_overflow() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_add_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-128));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_add_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_add_epi32(a, b);
        let e = _mm_setr_epi32(4, 6, 8, 10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_add_epi64(a, b);
        let e = _mm_setr_epi64x(2, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(-1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(-1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu8_saturate() {
        let a = _mm_set1_epi8(!0);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epu16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu16_saturate() {
        let a = _mm_set1_epi16(!0);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epu16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_avg_epu8() {
        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
        let r = _mm_avg_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_avg_epu16() {
        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
        let r = _mm_avg_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_madd_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm_madd_epi16(a, b);
        let e = _mm_setr_epi32(29, 81, 149, 233);
        assert_eq_m128i(r, e);

        // Test large values.
        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
        let a = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MIN,
            i16::MAX,
            0,
            0,
        );
        let b = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MAX,
            i16::MIN,
            0,
            0,
        );
        let r = _mm_madd_epi16(a, b);
        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_max_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_max_epu8(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_min_epi16(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_min_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mulhi_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mulhi_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-16));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mulhi_epu16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
        let r = _mm_mulhi_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(15));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mullo_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mullo_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-17960));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_epu32() {
        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
        let r = _mm_mul_epu32(a, b);
        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sad_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
            1, 2, 3, 4,
            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
            1, 2, 3, 4,
        );
        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
        let r = _mm_sad_epu8(a, b);
        let e = _mm_setr_epi64x(1020, 614);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
        let r = _mm_sub_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
        let r = _mm_sub_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi32() {
        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
        let r = _mm_sub_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi64() {
        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
        let r = _mm_sub_epi64(a, b);
        assert_eq_m128i(r, _mm_set1_epi64x(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(-1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(-1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu8_saturate() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu16_saturate() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<1>(a);
        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<15>(a);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_slli_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
        );
        let r = _mm_slli_epi16::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
        );
        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_set1_epi16(0));
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_slli_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
        let r = _mm_slli_epi32::<32>(a);
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_set1_epi32(0));
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_slli_epi64::<4>(a);
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
        let r = _mm_slli_epi64::<64>(a);
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srai_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srai_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
        );
        let r = _mm_srai_epi16::<16>(a);
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sra_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
        );
        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srai_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srai_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
        let r = _mm_srai_epi32::<32>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sra_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<1>(a);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
        );
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<15>(a);
        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srli_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
        );
        let r = _mm_srli_epi16::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
        );
        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_set1_epi16(0));
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srli_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
        let r = _mm_srli_epi32::<32>(a);
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_set1_epi32(0));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srli_epi64::<4>(a);
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srli_epi64::<64>(a);
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_and_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_andnot_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_or_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(7));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_xor_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            )
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi16(a, b);
        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(3, 2, 2, 0);
        let r = _mm_cmpeq_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi8() {
        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi8(0);
        let r = _mm_cmpgt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi16() {
        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi16(0);
        let r = _mm_cmpgt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi32() {
        let a = _mm_set_epi32(5, 0, 0, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_cmpgt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi8() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi32() {
        let a = _mm_set1_epi32(0);
        let b = _mm_set_epi32(5, 0, 0, 0);
        let r = _mm_cmplt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtepi32_pd() {
        let a = _mm_set_epi32(35, 25, 15, 5);
        let r = _mm_cvtepi32_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi32_sd() {
        let a = _mm_set1_pd(3.5);
        let r = _mm_cvtsi32_sd(a, 5);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtepi32_ps() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ps(a);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_epi32() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
    }
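
    // Illustrative sketch, not from the original suite (the `_example` test
    // name is new here): unlike the `_mm_cvt*` conversions above, the
    // `_mm_cast*` intrinsics only reinterpret bits, so a roundtrip through an
    // integer vector is lossless.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cast_roundtrip_example() {
        let a = _mm_setr_pd(1.0, 2.0);
        let r = _mm_castsi128_pd(_mm_castpd_si128(a));
        assert_eq_m128d(r, a);
    }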

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi32_si128() {
        let r = _mm_cvtsi32_si128(5);
        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi128_si32() {
        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
        assert_eq!(r, 5);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi64x() {
        let r = _mm_set_epi64x(0, 1);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi32() {
        let r = _mm_set_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi16() {
        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi8() {
        #[rustfmt::skip]
        let r = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi64x() {
        let r = _mm_set1_epi64x(1);
        assert_eq_m128i(r, _mm_set1_epi64x(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi32() {
        let r = _mm_set1_epi32(1);
        assert_eq_m128i(r, _mm_set1_epi32(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi16() {
        let r = _mm_set1_epi16(1);
        assert_eq_m128i(r, _mm_set1_epi16(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi8() {
        let r = _mm_set1_epi8(1);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi32() {
        let r = _mm_setr_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi16() {
        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_si128() {
        let r = _mm_setzero_si128();
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }
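
    // Illustrative sketch, not from the original suite: per its documentation,
    // `_mm_undefined_si128` currently returns some valid (in practice
    // all-zero) vector, though callers should not rely on the exact value.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_undefined_si128_example() {
        assert_eq_m128i(_mm_undefined_si128(), _mm_setzero_si128());
    }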

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadl_epi64() {
        let a = _mm_setr_epi64x(6, 5);
        let r = _mm_loadl_epi64(ptr::addr_of!(a));
        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
    }
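
    // Illustrative sketch, not from the original suite: `_mm_load1_pd`
    // broadcasts a single double from memory into both lanes.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load1_pd_example() {
        let d = 3.5_f64;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(3.5, 3.5));
    }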

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }
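
    // Illustrative sketch, not from the original suite: the reversed load
    // `_mm_loadr_pd` requires a 16-byte aligned source and returns the two
    // elements swapped.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadr_pd_example() {
        #[repr(align(16))]
        struct Memory {
            data: [f64; 2],
        }
        let mem = Memory { data: [1.0, 2.0] };
        let r = _mm_loadr_pd(mem.data.as_ptr());
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }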

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }
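
    // Illustrative sketch, not from the original suite: the narrow unaligned
    // loads (`_mm_loadu_si16/si32/si64`) read from an arbitrarily aligned
    // pointer and zero the remaining lanes. x86 is little-endian, so
    // `from_le_bytes` reconstructs the loaded value.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si32_example() {
        let bytes = [1u8, 2, 3, 4, 5, 6, 7, 8, 9];
        // Deliberately misaligned source address.
        let r = _mm_loadu_si32(bytes.as_ptr().add(1));
        let e = _mm_setr_epi32(i32::from_le_bytes([2, 3, 4, 5]), 0, 0, 0);
        assert_eq_m128i(r, e);
    }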

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_maskmoveu_si128() {
        let a = _mm_set1_epi8(9);
        #[rustfmt::skip]
        let mask = _mm_set_epi8(
            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
        );
        let mut r = _mm_set1_epi8(0);
        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_store_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_storeu_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }
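
    // Illustrative sketch, not from the original suite: `_mm_storeu_si32`
    // writes only the low 32 bits of the vector, and the destination may be
    // misaligned (here it is deliberately offset by one byte).
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si32_example() {
        let a = _mm_setr_epi32(0x1234_5678, -1, -1, -1);
        let mut buf = [0u8; 8];
        _mm_storeu_si32(buf.as_mut_ptr().add(1), a);
        // x86 is little-endian, so the bytes land in little-endian order.
        assert_eq!(&buf[1..5], &0x1234_5678_i32.to_le_bytes()[..]);
    }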

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storel_epi64() {
        let a = _mm_setr_epi64x(2, 9);
        let mut r = _mm_set1_epi8(0);
        _mm_storel_epi64(&mut r, a);
        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
    }
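
    // Illustrative sketch, not from the original suite: `_mm_storel_pd`
    // writes the lower lane of a `[2 x double]` vector and `_mm_storeh_pd`
    // the upper lane.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeh_storel_pd_example() {
        let a = _mm_setr_pd(1.0, 2.0);
        let (mut lo, mut hi) = (0.0_f64, 0.0_f64);
        _mm_storel_pd(&mut lo, a);
        _mm_storeh_pd(&mut hi, a);
        assert_eq!((lo, hi), (1.0, 2.0));
    }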

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si128() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_undefined_si128();
        _mm_stream_si128(ptr::addr_of_mut!(r), a);
        assert_eq_m128i(r, a);
    }
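
    // Illustrative sketch, not from the original suite: with a 16-byte
    // aligned buffer, `_mm_store1_pd` duplicates the low lane into both slots
    // and `_mm_storer_pd` writes the two lanes in reversed order.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd_storer_pd_example() {
        #[repr(align(16))]
        struct Memory {
            data: [f64; 2],
        }
        let mut mem = Memory { data: [0.0; 2] };
        let a = _mm_setr_pd(1.0, 2.0);
        _mm_store1_pd(mem.data.as_mut_ptr(), a);
        assert_eq!(mem.data, [1.0, 1.0]);
        _mm_storer_pd(mem.data.as_mut_ptr(), a);
        assert_eq!(mem.data, [2.0, 1.0]);
    }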

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si32() {
        let a: i32 = 7;
        let mut mem = boxed::Box::<i32>::new(-1);
        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
        assert_eq!(a, *mem);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_move_epi64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_move_epi64(a);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packs_epi16() {
        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
        let r = _mm_packs_epi16(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
            )
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packs_epi32() {
        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
        let r = _mm_packs_epi32(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packus_epi16() {
        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
        let r = _mm_packus_epi16(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_extract_epi16() {
        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm_extract_epi16::<0>(a);
        let r2 = _mm_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_insert_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm_insert_epi16::<0>(a, 9);
        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100110_00100101);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_epi32() {
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }
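
    // Illustrative sketch, not from the original suite: in `_mm_shuffle_pd`,
    // bit 0 of `MASK` chooses which lane of `a` becomes the low result lane,
    // and bit 1 chooses which lane of `b` becomes the high result lane.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_pd_example() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        // MASK = 0b01: low lane = a[1], high lane = b[0].
        let r = _mm_shuffle_pd::<0b01>(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 3.0));
    }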

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
4285
4286    #[simd_test(enable = "sse2")]
4287    unsafe fn test_mm_add_sd() {
4288        let a = _mm_setr_pd(1.0, 2.0);
4289        let b = _mm_setr_pd(5.0, 10.0);
4290        let r = _mm_add_sd(a, b);
4291        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4292    }
4293
4294    #[simd_test(enable = "sse2")]
4295    unsafe fn test_mm_add_pd() {
4296        let a = _mm_setr_pd(1.0, 2.0);
4297        let b = _mm_setr_pd(5.0, 10.0);
4298        let r = _mm_add_pd(a, b);
4299        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4300    }
4301
4302    #[simd_test(enable = "sse2")]
4303    unsafe fn test_mm_div_sd() {
4304        let a = _mm_setr_pd(1.0, 2.0);
4305        let b = _mm_setr_pd(5.0, 10.0);
4306        let r = _mm_div_sd(a, b);
4307        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4308    }
4309
4310    #[simd_test(enable = "sse2")]
4311    unsafe fn test_mm_div_pd() {
4312        let a = _mm_setr_pd(1.0, 2.0);
4313        let b = _mm_setr_pd(5.0, 10.0);
4314        let r = _mm_div_pd(a, b);
4315        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4316    }
4317
4318    #[simd_test(enable = "sse2")]
4319    unsafe fn test_mm_max_sd() {
4320        let a = _mm_setr_pd(1.0, 2.0);
4321        let b = _mm_setr_pd(5.0, 10.0);
4322        let r = _mm_max_sd(a, b);
4323        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4324    }
4325
4326    #[simd_test(enable = "sse2")]
4327    unsafe fn test_mm_max_pd() {
4328        let a = _mm_setr_pd(1.0, 2.0);
4329        let b = _mm_setr_pd(5.0, 10.0);
4330        let r = _mm_max_pd(a, b);
4331        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
4332
4333        // Check SSE(2)-specific semantics for -0.0 handling.
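        // When both inputs are zero (of either sign), `maxpd` returns its
        // second operand, i.e. it behaves like `if a > b { a } else { b }`.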
4334        let a = _mm_setr_pd(-0.0, 0.0);
4335        let b = _mm_setr_pd(0.0, 0.0);
4336        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
4337        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
4338        let a: [u8; 16] = transmute(a);
4339        let b: [u8; 16] = transmute(b);
4340        assert_eq!(r1, b);
4341        assert_eq!(r2, a);
4342        assert_ne!(a, b); // sanity check that -0.0 is actually present
4343    }
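
    // Sketch (not from the upstream suite): `maxpd` also returns its second
    // operand when either input is NaN, consistent with the
    // `if a > b { a } else { b }` model above.
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_pd_nan_sketch() {
        let a = _mm_setr_pd(f64::NAN, 1.0);
        let b = _mm_setr_pd(2.0, f64::NAN);
        let r = _mm_max_pd(a, b);
        assert_eq!(get_m128d(r, 0), 2.0);
        assert!(get_m128d(r, 1).is_nan());
    }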
4344
4345    #[simd_test(enable = "sse2")]
4346    unsafe fn test_mm_min_sd() {
4347        let a = _mm_setr_pd(1.0, 2.0);
4348        let b = _mm_setr_pd(5.0, 10.0);
4349        let r = _mm_min_sd(a, b);
4350        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4351    }
4352
4353    #[simd_test(enable = "sse2")]
4354    unsafe fn test_mm_min_pd() {
4355        let a = _mm_setr_pd(1.0, 2.0);
4356        let b = _mm_setr_pd(5.0, 10.0);
4357        let r = _mm_min_pd(a, b);
4358        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4359
4360        // Check SSE(2)-specific semantics for -0.0 handling.
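        // As with `maxpd`, equal-valued zeros make `minpd` return its second
        // operand: it behaves like `if a < b { a } else { b }`.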
4361        let a = _mm_setr_pd(-0.0, 0.0);
4362        let b = _mm_setr_pd(0.0, 0.0);
4363        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
4364        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
4365        let a: [u8; 16] = transmute(a);
4366        let b: [u8; 16] = transmute(b);
4367        assert_eq!(r1, b);
4368        assert_eq!(r2, a);
4369        assert_ne!(a, b); // sanity check that -0.0 is actually present
4370    }
4371
4372    #[simd_test(enable = "sse2")]
4373    unsafe fn test_mm_mul_sd() {
4374        let a = _mm_setr_pd(1.0, 2.0);
4375        let b = _mm_setr_pd(5.0, 10.0);
4376        let r = _mm_mul_sd(a, b);
4377        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4378    }
4379
4380    #[simd_test(enable = "sse2")]
4381    unsafe fn test_mm_mul_pd() {
4382        let a = _mm_setr_pd(1.0, 2.0);
4383        let b = _mm_setr_pd(5.0, 10.0);
4384        let r = _mm_mul_pd(a, b);
4385        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
4386    }
4387
4388    #[simd_test(enable = "sse2")]
4389    unsafe fn test_mm_sqrt_sd() {
4390        let a = _mm_setr_pd(1.0, 2.0);
4391        let b = _mm_setr_pd(5.0, 10.0);
4392        let r = _mm_sqrt_sd(a, b);
4393        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
4394    }
4395
4396    #[simd_test(enable = "sse2")]
4397    unsafe fn test_mm_sqrt_pd() {
4398        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
4399        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
4400    }
4401
4402    #[simd_test(enable = "sse2")]
4403    unsafe fn test_mm_sub_sd() {
4404        let a = _mm_setr_pd(1.0, 2.0);
4405        let b = _mm_setr_pd(5.0, 10.0);
4406        let r = _mm_sub_sd(a, b);
4407        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
4408    }
4409
4410    #[simd_test(enable = "sse2")]
4411    unsafe fn test_mm_sub_pd() {
4412        let a = _mm_setr_pd(1.0, 2.0);
4413        let b = _mm_setr_pd(5.0, 10.0);
4414        let r = _mm_sub_pd(a, b);
4415        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
4416    }
4417
4418    #[simd_test(enable = "sse2")]
4419    unsafe fn test_mm_and_pd() {
4420        let a = transmute(u64x2::splat(5));
4421        let b = transmute(u64x2::splat(3));
4422        let r = _mm_and_pd(a, b);
4423        let e = transmute(u64x2::splat(1));
4424        assert_eq_m128d(r, e);
4425    }
4426
4427    #[simd_test(enable = "sse2")]
4428    unsafe fn test_mm_andnot_pd() {
4429        let a = transmute(u64x2::splat(5));
4430        let b = transmute(u64x2::splat(3));
4431        let r = _mm_andnot_pd(a, b);
4432        let e = transmute(u64x2::splat(2));
4433        assert_eq_m128d(r, e);
4434    }
4435
4436    #[simd_test(enable = "sse2")]
4437    unsafe fn test_mm_or_pd() {
4438        let a = transmute(u64x2::splat(5));
4439        let b = transmute(u64x2::splat(3));
4440        let r = _mm_or_pd(a, b);
4441        let e = transmute(u64x2::splat(7));
4442        assert_eq_m128d(r, e);
4443    }
4444
4445    #[simd_test(enable = "sse2")]
4446    unsafe fn test_mm_xor_pd() {
4447        let a = transmute(u64x2::splat(5));
4448        let b = transmute(u64x2::splat(3));
4449        let r = _mm_xor_pd(a, b);
4450        let e = transmute(u64x2::splat(6));
4451        assert_eq_m128d(r, e);
4452    }
4453
4454    #[simd_test(enable = "sse2")]
4455    unsafe fn test_mm_cmpeq_sd() {
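        // Scalar compares write an all-ones (!0) or all-zero mask to the low
        // lane and copy the upper lane of `a` through unchanged, hence the
        // expected upper half is the bit pattern of 2.0.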
4456        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4457        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4458        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
4459        assert_eq_m128i(r, e);
4460    }
4461
4462    #[simd_test(enable = "sse2")]
4463    unsafe fn test_mm_cmplt_sd() {
4464        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4465        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4466        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
4467        assert_eq_m128i(r, e);
4468    }
4469
4470    #[simd_test(enable = "sse2")]
4471    unsafe fn test_mm_cmple_sd() {
4472        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4473        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4474        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
4475        assert_eq_m128i(r, e);
4476    }
4477
4478    #[simd_test(enable = "sse2")]
4479    unsafe fn test_mm_cmpgt_sd() {
4480        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4481        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4482        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
4483        assert_eq_m128i(r, e);
4484    }
4485
4486    #[simd_test(enable = "sse2")]
4487    unsafe fn test_mm_cmpge_sd() {
4488        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4489        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4490        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
4491        assert_eq_m128i(r, e);
4492    }
4493
4494    #[simd_test(enable = "sse2")]
4495    unsafe fn test_mm_cmpord_sd() {
4496        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4497        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4498        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
4499        assert_eq_m128i(r, e);
4500    }
4501
4502    #[simd_test(enable = "sse2")]
4503    unsafe fn test_mm_cmpunord_sd() {
4504        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4505        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4506        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
4507        assert_eq_m128i(r, e);
4508    }
4509
4510    #[simd_test(enable = "sse2")]
4511    unsafe fn test_mm_cmpneq_sd() {
4512        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4513        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4514        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
4515        assert_eq_m128i(r, e);
4516    }
4517
4518    #[simd_test(enable = "sse2")]
4519    unsafe fn test_mm_cmpnlt_sd() {
4520        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4521        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4522        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
4523        assert_eq_m128i(r, e);
4524    }
4525
4526    #[simd_test(enable = "sse2")]
4527    unsafe fn test_mm_cmpnle_sd() {
4528        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4529        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4530        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
4531        assert_eq_m128i(r, e);
4532    }
4533
4534    #[simd_test(enable = "sse2")]
4535    unsafe fn test_mm_cmpngt_sd() {
4536        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4537        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4538        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
4539        assert_eq_m128i(r, e);
4540    }
4541
4542    #[simd_test(enable = "sse2")]
4543    unsafe fn test_mm_cmpnge_sd() {
4544        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4545        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4546        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
4547        assert_eq_m128i(r, e);
4548    }
4549
4550    #[simd_test(enable = "sse2")]
4551    unsafe fn test_mm_cmpeq_pd() {
4552        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4553        let e = _mm_setr_epi64x(!0, 0);
4554        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
4555        assert_eq_m128i(r, e);
4556    }
4557
4558    #[simd_test(enable = "sse2")]
4559    unsafe fn test_mm_cmplt_pd() {
4560        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4561        let e = _mm_setr_epi64x(0, !0);
4562        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
4563        assert_eq_m128i(r, e);
4564    }
4565
4566    #[simd_test(enable = "sse2")]
4567    unsafe fn test_mm_cmple_pd() {
4568        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4569        let e = _mm_setr_epi64x(!0, !0);
4570        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
4571        assert_eq_m128i(r, e);
4572    }
4573
4574    #[simd_test(enable = "sse2")]
4575    unsafe fn test_mm_cmpgt_pd() {
4576        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4577        let e = _mm_setr_epi64x(0, 0);
4578        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
4579        assert_eq_m128i(r, e);
4580    }
4581
4582    #[simd_test(enable = "sse2")]
4583    unsafe fn test_mm_cmpge_pd() {
4584        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4585        let e = _mm_setr_epi64x(!0, 0);
4586        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
4587        assert_eq_m128i(r, e);
4588    }
4589
4590    #[simd_test(enable = "sse2")]
4591    unsafe fn test_mm_cmpord_pd() {
4592        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4593        let e = _mm_setr_epi64x(0, !0);
4594        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
4595        assert_eq_m128i(r, e);
4596    }
4597
4598    #[simd_test(enable = "sse2")]
4599    unsafe fn test_mm_cmpunord_pd() {
4600        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4601        let e = _mm_setr_epi64x(!0, 0);
4602        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
4603        assert_eq_m128i(r, e);
4604    }
4605
4606    #[simd_test(enable = "sse2")]
4607    unsafe fn test_mm_cmpneq_pd() {
4608        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4609        let e = _mm_setr_epi64x(!0, !0);
4610        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
4611        assert_eq_m128i(r, e);
4612    }
4613
4614    #[simd_test(enable = "sse2")]
4615    unsafe fn test_mm_cmpnlt_pd() {
4616        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4617        let e = _mm_setr_epi64x(0, 0);
4618        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
4619        assert_eq_m128i(r, e);
4620    }
4621
4622    #[simd_test(enable = "sse2")]
4623    unsafe fn test_mm_cmpnle_pd() {
4624        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4625        let e = _mm_setr_epi64x(0, 0);
4626        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
4627        assert_eq_m128i(r, e);
4628    }
4629
4630    #[simd_test(enable = "sse2")]
4631    unsafe fn test_mm_cmpngt_pd() {
4632        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4633        let e = _mm_setr_epi64x(0, !0);
4634        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
4635        assert_eq_m128i(r, e);
4636    }
4637
4638    #[simd_test(enable = "sse2")]
4639    unsafe fn test_mm_cmpnge_pd() {
4640        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4641        let e = _mm_setr_epi64x(0, !0);
4642        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
4643        assert_eq_m128i(r, e);
4644    }
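
    // Sketch (not from the upstream suite): the all-ones/all-zero masks the
    // packed compares produce compose with the bitwise ops into a branchless
    // per-lane select, here computing a lane-wise minimum by hand.
    #[simd_test(enable = "sse2")]
    unsafe fn test_cmp_mask_select_sketch() {
        let a = _mm_setr_pd(1.0, 4.0);
        let b = _mm_setr_pd(3.0, 2.0);
        let mask = _mm_cmplt_pd(a, b);
        let r = _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }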
4645
4646    #[simd_test(enable = "sse2")]
4647    unsafe fn test_mm_comieq_sd() {
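        // `comisd` returns 1 when the relation holds and 0 otherwise; a NaN
        // operand makes every ordered relation, including equality, report 0.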
4648        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4649        assert!(_mm_comieq_sd(a, b) != 0);
4650
4651        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
4652        assert!(_mm_comieq_sd(a, b) == 0);
4653    }
4654
4655    #[simd_test(enable = "sse2")]
4656    unsafe fn test_mm_comilt_sd() {
4657        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4658        assert!(_mm_comilt_sd(a, b) == 0);
4659    }
4660
4661    #[simd_test(enable = "sse2")]
4662    unsafe fn test_mm_comile_sd() {
4663        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4664        assert!(_mm_comile_sd(a, b) != 0);
4665    }
4666
4667    #[simd_test(enable = "sse2")]
4668    unsafe fn test_mm_comigt_sd() {
4669        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4670        assert!(_mm_comigt_sd(a, b) == 0);
4671    }
4672
4673    #[simd_test(enable = "sse2")]
4674    unsafe fn test_mm_comige_sd() {
4675        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4676        assert!(_mm_comige_sd(a, b) != 0);
4677    }
4678
4679    #[simd_test(enable = "sse2")]
4680    unsafe fn test_mm_comineq_sd() {
4681        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4682        assert!(_mm_comineq_sd(a, b) == 0);
4683    }
4684
4685    #[simd_test(enable = "sse2")]
4686    unsafe fn test_mm_ucomieq_sd() {
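        // `ucomisd` computes the same results as `comisd`; the two differ
        // only in that `ucomisd` does not signal invalid for quiet NaNs.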
4687        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4688        assert!(_mm_ucomieq_sd(a, b) != 0);
4689
4690        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
4691        assert!(_mm_ucomieq_sd(a, b) == 0);
4692    }
4693
4694    #[simd_test(enable = "sse2")]
4695    unsafe fn test_mm_ucomilt_sd() {
4696        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4697        assert!(_mm_ucomilt_sd(a, b) == 0);
4698    }
4699
4700    #[simd_test(enable = "sse2")]
4701    unsafe fn test_mm_ucomile_sd() {
4702        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4703        assert!(_mm_ucomile_sd(a, b) != 0);
4704    }
4705
4706    #[simd_test(enable = "sse2")]
4707    unsafe fn test_mm_ucomigt_sd() {
4708        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4709        assert!(_mm_ucomigt_sd(a, b) == 0);
4710    }
4711
4712    #[simd_test(enable = "sse2")]
4713    unsafe fn test_mm_ucomige_sd() {
4714        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4715        assert!(_mm_ucomige_sd(a, b) != 0);
4716    }
4717
4718    #[simd_test(enable = "sse2")]
4719    unsafe fn test_mm_ucomineq_sd() {
4720        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4721        assert!(_mm_ucomineq_sd(a, b) == 0);
4722    }
4723
4724    #[simd_test(enable = "sse2")]
4725    unsafe fn test_mm_movemask_pd() {
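        // Each result bit is the sign bit of the corresponding f64 lane.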
4726        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
4727        assert_eq!(r, 0b01);
4728
4729        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
4730        assert_eq!(r, 0b11);
4731    }
4732
4733    #[repr(align(16))]
4734    struct Memory {
4735        data: [f64; 4],
4736    }
4737
4738    #[simd_test(enable = "sse2")]
4739    unsafe fn test_mm_load_pd() {
4740        let mem = Memory {
4741            data: [1.0f64, 2.0, 3.0, 4.0],
4742        };
4743        let vals = &mem.data;
4744        let d = vals.as_ptr();
4745
4746        let r = _mm_load_pd(d);
4747        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4748    }
4749
4750    #[simd_test(enable = "sse2")]
4751    unsafe fn test_mm_load_sd() {
4752        let a = 1.;
4753        let expected = _mm_setr_pd(a, 0.);
4754        let r = _mm_load_sd(&a);
4755        assert_eq_m128d(r, expected);
4756    }
4757
4758    #[simd_test(enable = "sse2")]
4759    unsafe fn test_mm_loadh_pd() {
4760        let a = _mm_setr_pd(1., 2.);
4761        let b = 3.;
4762        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
4763        let r = _mm_loadh_pd(a, &b);
4764        assert_eq_m128d(r, expected);
4765    }
4766
4767    #[simd_test(enable = "sse2")]
4768    unsafe fn test_mm_loadl_pd() {
4769        let a = _mm_setr_pd(1., 2.);
4770        let b = 3.;
4771        let expected = _mm_setr_pd(3., get_m128d(a, 1));
4772        let r = _mm_loadl_pd(a, &b);
4773        assert_eq_m128d(r, expected);
4774    }
4775
4776    #[simd_test(enable = "sse2")]
4777    // Miri cannot support this until it is clear how it fits in the Rust memory model
4778    // (non-temporal store)
4779    #[cfg_attr(miri, ignore)]
4780    unsafe fn test_mm_stream_pd() {
4781        #[repr(align(128))]
4782        struct Memory {
4783            pub data: [f64; 2],
4784        }
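        // `movntpd` requires a 16-byte-aligned destination; the over-aligned
        // wrapper above guarantees that.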
4785        let a = _mm_set1_pd(7.0);
4786        let mut mem = Memory { data: [-1.0; 2] };
4787
4788        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4789        for i in 0..2 {
4790            assert_eq!(mem.data[i], get_m128d(a, i));
4791        }
4792    }
4793
4794    #[simd_test(enable = "sse2")]
4795    unsafe fn test_mm_store_sd() {
4796        let mut dest = 0.;
4797        let a = _mm_setr_pd(1., 2.);
4798        _mm_store_sd(&mut dest, a);
4799        assert_eq!(dest, _mm_cvtsd_f64(a));
4800    }
4801
4802    #[simd_test(enable = "sse2")]
4803    unsafe fn test_mm_store_pd() {
4804        let mut mem = Memory { data: [0.0f64; 4] };
4805        let vals = &mut mem.data;
4806        let a = _mm_setr_pd(1.0, 2.0);
4807        let d = vals.as_mut_ptr();
4808
4809        _mm_store_pd(d, *black_box(&a));
4810        assert_eq!(vals[0], 1.0);
4811        assert_eq!(vals[1], 2.0);
4812    }
4813
4814    #[simd_test(enable = "sse2")]
4815    unsafe fn test_mm_storeu_pd() {
4816        let mut mem = Memory { data: [0.0f64; 4] };
4817        let vals = &mut mem.data;
4818        let a = _mm_setr_pd(1.0, 2.0);
4819
4820        let mut ofs = 0;
4821        let mut p = vals.as_mut_ptr();
4822
4823        // Make sure p is **not** aligned to a 16-byte boundary
4824        if (p as usize) & 0xf == 0 {
4825            ofs = 1;
4826            p = p.add(1);
4827        }
4828
4829        _mm_storeu_pd(p, *black_box(&a));
4830
4831        if ofs > 0 {
4832            assert_eq!(vals[ofs - 1], 0.0);
4833        }
4834        assert_eq!(vals[ofs], 1.0);
4835        assert_eq!(vals[ofs + 1], 2.0);
4836    }
4837
4838    #[simd_test(enable = "sse2")]
4839    unsafe fn test_mm_storeu_si16() {
4840        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4841        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
4842        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
4843        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
4844        assert_eq_m128i(r, e);
4845    }
4846
4847    #[simd_test(enable = "sse2")]
4848    unsafe fn test_mm_storeu_si32() {
4849        let a = _mm_setr_epi32(1, 2, 3, 4);
4850        let mut r = _mm_setr_epi32(5, 6, 7, 8);
4851        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
4852        let e = _mm_setr_epi32(1, 6, 7, 8);
4853        assert_eq_m128i(r, e);
4854    }
4855
4856    #[simd_test(enable = "sse2")]
4857    unsafe fn test_mm_storeu_si64() {
4858        let a = _mm_setr_epi64x(1, 2);
4859        let mut r = _mm_setr_epi64x(3, 4);
4860        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
4861        let e = _mm_setr_epi64x(1, 4);
4862        assert_eq_m128i(r, e);
4863    }
4864
4865    #[simd_test(enable = "sse2")]
4866    unsafe fn test_mm_store1_pd() {
4867        let mut mem = Memory { data: [0.0f64; 4] };
4868        let vals = &mut mem.data;
4869        let a = _mm_setr_pd(1.0, 2.0);
4870        let d = vals.as_mut_ptr();
4871
4872        _mm_store1_pd(d, *black_box(&a));
4873        assert_eq!(vals[0], 1.0);
4874        assert_eq!(vals[1], 1.0);
4875    }
4876
4877    #[simd_test(enable = "sse2")]
4878    unsafe fn test_mm_store_pd1() {
4879        let mut mem = Memory { data: [0.0f64; 4] };
4880        let vals = &mut mem.data;
4881        let a = _mm_setr_pd(1.0, 2.0);
4882        let d = vals.as_mut_ptr();
4883
4884        _mm_store_pd1(d, *black_box(&a));
4885        assert_eq!(vals[0], 1.0);
4886        assert_eq!(vals[1], 1.0);
4887    }
4888
4889    #[simd_test(enable = "sse2")]
4890    unsafe fn test_mm_storer_pd() {
4891        let mut mem = Memory { data: [0.0f64; 4] };
4892        let vals = &mut mem.data;
4893        let a = _mm_setr_pd(1.0, 2.0);
4894        let d = vals.as_mut_ptr();
4895
4896        _mm_storer_pd(d, *black_box(&a));
4897        assert_eq!(vals[0], 2.0);
4898        assert_eq!(vals[1], 1.0);
4899    }
4900
4901    #[simd_test(enable = "sse2")]
4902    unsafe fn test_mm_storeh_pd() {
4903        let mut dest = 0.;
4904        let a = _mm_setr_pd(1., 2.);
4905        _mm_storeh_pd(&mut dest, a);
4906        assert_eq!(dest, get_m128d(a, 1));
4907    }
4908
4909    #[simd_test(enable = "sse2")]
4910    unsafe fn test_mm_storel_pd() {
4911        let mut dest = 0.;
4912        let a = _mm_setr_pd(1., 2.);
4913        _mm_storel_pd(&mut dest, a);
4914        assert_eq!(dest, _mm_cvtsd_f64(a));
4915    }
4916
4917    #[simd_test(enable = "sse2")]
4918    unsafe fn test_mm_loadr_pd() {
4919        let mut mem = Memory {
4920            data: [1.0f64, 2.0, 3.0, 4.0],
4921        };
4922        let vals = &mut mem.data;
4923        let d = vals.as_ptr();
4924
4925        let r = _mm_loadr_pd(d);
4926        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
4927    }
4928
4929    #[simd_test(enable = "sse2")]
4930    unsafe fn test_mm_loadu_pd() {
4931        let mut mem = Memory {
4932            data: [1.0f64, 2.0, 3.0, 4.0],
4933        };
4934        let vals = &mut mem.data;
4935        let mut d = vals.as_ptr();
4936
4937        // Make sure d is **not** aligned to a 16-byte boundary
4938        let mut offset = 0;
4939        if (d as usize) & 0xf == 0 {
4940            offset = 1;
4941            d = d.add(offset);
4942        }
4943
4944        let r = _mm_loadu_pd(d);
4945        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
4946        assert_eq_m128d(r, e);
4947    }
4948
4949    #[simd_test(enable = "sse2")]
4950    unsafe fn test_mm_loadu_si16() {
4951        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4952        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
4953        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
4954    }
4955
4956    #[simd_test(enable = "sse2")]
4957    unsafe fn test_mm_loadu_si32() {
4958        let a = _mm_setr_epi32(1, 2, 3, 4);
4959        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
4960        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
4961    }
4962
4963    #[simd_test(enable = "sse2")]
4964    unsafe fn test_mm_loadu_si64() {
4965        let a = _mm_setr_epi64x(5, 6);
4966        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
4967        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4968    }
4969
4970    #[simd_test(enable = "sse2")]
4971    unsafe fn test_mm_cvtpd_ps() {
4972        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
4973        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
4974
4975        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
4976        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
4977
4978        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
4979        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
4980
4981        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
4982        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
4983    }
4984
4985    #[simd_test(enable = "sse2")]
4986    unsafe fn test_mm_cvtps_pd() {
4987        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
4988        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
4989
4990        let r = _mm_cvtps_pd(_mm_setr_ps(
4991            f32::MAX,
4992            f32::INFINITY,
4993            f32::NEG_INFINITY,
4994            f32::MIN,
4995        ));
4996        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
4997    }
4998
4999    #[simd_test(enable = "sse2")]
5000    unsafe fn test_mm_cvtpd_epi32() {
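        // Results that overflow i32, as well as NaN inputs, produce the
        // "integer indefinite" value i32::MIN.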
5001        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
5002        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
5003
5004        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
5005        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
5006
5007        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
5008        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5009
5010        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
5011        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5012
5013        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
5014        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5015    }
5016
5017    #[simd_test(enable = "sse2")]
5018    unsafe fn test_mm_cvtsd_si32() {
5019        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
5020        assert_eq!(r, -2);
5021
5022        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
5023        assert_eq!(r, i32::MIN);
5024
5025        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
5026        assert_eq!(r, i32::MIN);
5027    }
5028
5029    #[simd_test(enable = "sse2")]
5030    unsafe fn test_mm_cvtsd_ss() {
5031        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
5032        let b = _mm_setr_pd(2.0, -5.0);
5033
5034        let r = _mm_cvtsd_ss(a, b);
5035
5036        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
5037
5038        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
5039        let b = _mm_setr_pd(f64::INFINITY, -5.0);
5040
5041        let r = _mm_cvtsd_ss(a, b);
5042
5043        assert_eq_m128(
5044            r,
5045            _mm_setr_ps(
5046                f32::INFINITY,
5047                f32::NEG_INFINITY,
5048                f32::MAX,
5049                f32::NEG_INFINITY,
5050            ),
5051        );
5052    }
5053
5054    #[simd_test(enable = "sse2")]
5055    unsafe fn test_mm_cvtsd_f64() {
5056        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
5057        assert_eq!(r, -1.1);
5058    }
5059
5060    #[simd_test(enable = "sse2")]
5061    unsafe fn test_mm_cvtss_sd() {
5062        let a = _mm_setr_pd(-1.1, 2.2);
5063        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
5064
5065        let r = _mm_cvtss_sd(a, b);
5066        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
5067
5068        let a = _mm_setr_pd(-1.1, f64::INFINITY);
5069        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
5070
5071        let r = _mm_cvtss_sd(a, b);
5072        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
5073    }
5074
5075    #[simd_test(enable = "sse2")]
5076    unsafe fn test_mm_cvttpd_epi32() {
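        // The `tt` variants truncate toward zero instead of honoring the
        // current rounding mode; out-of-range and NaN inputs still give
        // i32::MIN.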
5077        let a = _mm_setr_pd(-1.1, 2.2);
5078        let r = _mm_cvttpd_epi32(a);
5079        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
5080
5081        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5082        let r = _mm_cvttpd_epi32(a);
5083        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5084    }
5085
5086    #[simd_test(enable = "sse2")]
5087    unsafe fn test_mm_cvttsd_si32() {
5088        let a = _mm_setr_pd(-1.1, 2.2);
5089        let r = _mm_cvttsd_si32(a);
5090        assert_eq!(r, -1);
5091
5092        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5093        let r = _mm_cvttsd_si32(a);
5094        assert_eq!(r, i32::MIN);
5095    }
5096
5097    #[simd_test(enable = "sse2")]
5098    unsafe fn test_mm_cvttps_epi32() {
5099        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
5100        let r = _mm_cvttps_epi32(a);
5101        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
5102
5103        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
5104        let r = _mm_cvttps_epi32(a);
5105        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
5106    }
5107
5108    #[simd_test(enable = "sse2")]
5109    unsafe fn test_mm_set_sd() {
5110        let r = _mm_set_sd(-1.0_f64);
5111        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
5112    }
5113
5114    #[simd_test(enable = "sse2")]
5115    unsafe fn test_mm_set1_pd() {
5116        let r = _mm_set1_pd(-1.0_f64);
5117        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
5118    }
5119
5120    #[simd_test(enable = "sse2")]
5121    unsafe fn test_mm_set_pd1() {
5122        let r = _mm_set_pd1(-2.0_f64);
5123        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
5124    }
5125
5126    #[simd_test(enable = "sse2")]
5127    unsafe fn test_mm_set_pd() {
5128        let r = _mm_set_pd(1.0_f64, 5.0_f64);
5129        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
5130    }
5131
5132    #[simd_test(enable = "sse2")]
5133    unsafe fn test_mm_setr_pd() {
5134        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
5135        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
5136    }
5137
5138    #[simd_test(enable = "sse2")]
5139    unsafe fn test_mm_setzero_pd() {
5140        let r = _mm_setzero_pd();
5141        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
5142    }
5143
5144    #[simd_test(enable = "sse2")]
5145    unsafe fn test_mm_load1_pd() {
5146        let d = -5.0;
5147        let r = _mm_load1_pd(&d);
5148        assert_eq_m128d(r, _mm_setr_pd(d, d));
5149    }
5150
5151    #[simd_test(enable = "sse2")]
5152    unsafe fn test_mm_load_pd1() {
5153        let d = -5.0;
5154        let r = _mm_load_pd1(&d);
5155        assert_eq_m128d(r, _mm_setr_pd(d, d));
5156    }
5157
5158    #[simd_test(enable = "sse2")]
5159    unsafe fn test_mm_unpackhi_pd() {
5160        let a = _mm_setr_pd(1.0, 2.0);
5161        let b = _mm_setr_pd(3.0, 4.0);
5162        let r = _mm_unpackhi_pd(a, b);
5163        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
5164    }
5165
5166    #[simd_test(enable = "sse2")]
5167    unsafe fn test_mm_unpacklo_pd() {
5168        let a = _mm_setr_pd(1.0, 2.0);
5169        let b = _mm_setr_pd(3.0, 4.0);
5170        let r = _mm_unpacklo_pd(a, b);
5171        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
5172    }
5173
5174    #[simd_test(enable = "sse2")]
5175    unsafe fn test_mm_shuffle_pd() {
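        // Bit 0 of the immediate selects the low result lane from `a` and
        // bit 1 the high result lane from `b`; zero takes lane 0 of each.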
5176        let a = _mm_setr_pd(1., 2.);
5177        let b = _mm_setr_pd(3., 4.);
5178        let expected = _mm_setr_pd(1., 3.);
5179        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
5180        assert_eq_m128d(r, expected);
5181    }
5182
5183    #[simd_test(enable = "sse2")]
5184    unsafe fn test_mm_move_sd() {
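        // The low lane comes from `b`, the high lane from `a`.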
5185        let a = _mm_setr_pd(1., 2.);
5186        let b = _mm_setr_pd(3., 4.);
5187        let expected = _mm_setr_pd(3., 2.);
5188        let r = _mm_move_sd(a, b);
5189        assert_eq_m128d(r, expected);
5190    }
5191
5192    #[simd_test(enable = "sse2")]
5193    unsafe fn test_mm_castpd_ps() {
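        // The cast intrinsics only reinterpret bits and emit no
        // instructions, so an all-zero __m128d is an all-zero __m128.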
5194        let a = _mm_set1_pd(0.);
5195        let expected = _mm_set1_ps(0.);
5196        let r = _mm_castpd_ps(a);
5197        assert_eq_m128(r, expected);
5198    }
5199
5200    #[simd_test(enable = "sse2")]
5201    unsafe fn test_mm_castpd_si128() {
5202        let a = _mm_set1_pd(0.);
5203        let expected = _mm_set1_epi64x(0);
5204        let r = _mm_castpd_si128(a);
5205        assert_eq_m128i(r, expected);
5206    }
5207
5208    #[simd_test(enable = "sse2")]
5209    unsafe fn test_mm_castps_pd() {
5210        let a = _mm_set1_ps(0.);
5211        let expected = _mm_set1_pd(0.);
5212        let r = _mm_castps_pd(a);
5213        assert_eq_m128d(r, expected);
5214    }
5215
5216    #[simd_test(enable = "sse2")]
5217    unsafe fn test_mm_castps_si128() {
5218        let a = _mm_set1_ps(0.);
5219        let expected = _mm_set1_epi32(0);
5220        let r = _mm_castps_si128(a);
5221        assert_eq_m128i(r, expected);
5222    }
5223
5224    #[simd_test(enable = "sse2")]
5225    unsafe fn test_mm_castsi128_pd() {
5226        let a = _mm_set1_epi64x(0);
5227        let expected = _mm_set1_pd(0.);
5228        let r = _mm_castsi128_pd(a);
5229        assert_eq_m128d(r, expected);
5230    }
5231
5232    #[simd_test(enable = "sse2")]
5233    unsafe fn test_mm_castsi128_ps() {
5234        let a = _mm_set1_epi32(0);
5235        let expected = _mm_set1_ps(0.);
5236        let r = _mm_castsi128_ps(a);
5237        assert_eq_m128(r, expected);
5238    }
5239}