clang 22.0.0git
smmintrin.h
Go to the documentation of this file.
1/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __SMMINTRIN_H
11#define __SMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <tmmintrin.h>
18
19/* Define the default attributes for the functions in this file. */
/* Both variants are always-inline and nodebug, target "sse4.1", and declare a
 * minimum vector width of 128. When __EVEX512__ is defined but
 * __AVX10_1_512__ is not, "no-evex512" is appended to the target string. */
20#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
21#define __DEFAULT_FN_ATTRS \
22 __attribute__((__always_inline__, __nodebug__, \
23 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
24#else
25#define __DEFAULT_FN_ATTRS \
26 __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
27 __min_vector_width__(128)))
28#endif
29
/* Same attributes as __DEFAULT_FN_ATTRS, with the constexpr specifier added
 * when compiled as C++11 or later. In C and in older C++ the two macros
 * expand identically. */
30#if defined(__cplusplus) && (__cplusplus >= 201103L)
31#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
32#else
33#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
34#endif
35
36/* SSE4 Rounding macros. */
/* Values for the rounding-control operand M of the _mm_round_* macros.
 * Bits [1:0] select the rounding mode; bit [2] (_MM_FROUND_CUR_DIRECTION)
 * selects the current MXCSR setting instead of bits [1:0]. */
37#define _MM_FROUND_TO_NEAREST_INT 0x00
38#define _MM_FROUND_TO_NEG_INF 0x01
39#define _MM_FROUND_TO_POS_INF 0x02
40#define _MM_FROUND_TO_ZERO 0x03
41#define _MM_FROUND_CUR_DIRECTION 0x04
42
/* Bit [3] of M: when set (_MM_FROUND_NO_EXC) the PE field is not updated;
 * when clear (_MM_FROUND_RAISE_EXC) a normal PE exception is used. */
43#define _MM_FROUND_RAISE_EXC 0x00
44#define _MM_FROUND_NO_EXC 0x08
45
/* Ready-made combinations of an exception setting and a rounding mode. */
46#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
47#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
48#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
49#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
50#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
51#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
52
53/// Rounds up each element of the 128-bit vector of [4 x float] to an
54/// integer and returns the rounded values in a 128-bit vector of
55/// [4 x float].
56///
/// Expands to <c> _mm_round_ps(X, _MM_FROUND_CEIL) </c>.
///
57/// \headerfile <x86intrin.h>
58///
59/// \code
60/// __m128 _mm_ceil_ps(__m128 X);
61/// \endcode
62///
63/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
64///
65/// \param X
66/// A 128-bit vector of [4 x float] values to be rounded up.
67/// \returns A 128-bit vector of [4 x float] containing the rounded values.
68#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
69
70/// Rounds up each element of the 128-bit vector of [2 x double] to an
71/// integer and returns the rounded values in a 128-bit vector of
72/// [2 x double].
73///
/// Expands to <c> _mm_round_pd(X, _MM_FROUND_CEIL) </c>.
///
74/// \headerfile <x86intrin.h>
75///
76/// \code
77/// __m128d _mm_ceil_pd(__m128d X);
78/// \endcode
79///
80/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
81///
82/// \param X
83/// A 128-bit vector of [2 x double] values to be rounded up.
84/// \returns A 128-bit vector of [2 x double] containing the rounded values.
85#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
86
87/// Copies three upper elements of the first 128-bit vector operand to
88/// the corresponding three upper elements of the 128-bit result vector of
89/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
90/// operand to an integer and copies it to the lowest element of the 128-bit
91/// result vector of [4 x float].
92///
/// Expands to <c> _mm_round_ss(X, Y, _MM_FROUND_CEIL) </c>.
///
93/// \headerfile <x86intrin.h>
94///
95/// \code
96/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
97/// \endcode
98///
99/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
100///
101/// \param X
102/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
103/// copied to the corresponding bits of the result.
104/// \param Y
105/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
106/// rounded up to the nearest integer and copied to the corresponding bits
107/// of the result.
108/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
109/// values.
110#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
111
112/// Copies the upper element of the first 128-bit vector operand to the
113/// corresponding upper element of the 128-bit result vector of [2 x double].
114/// Rounds up the lower element of the second 128-bit vector operand to an
115/// integer and copies it to the lower element of the 128-bit result vector
116/// of [2 x double].
117///
/// Expands to <c> _mm_round_sd(X, Y, _MM_FROUND_CEIL) </c>.
///
118/// \headerfile <x86intrin.h>
119///
120/// \code
121/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
122/// \endcode
123///
124/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
125///
126/// \param X
127/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
128/// copied to the corresponding bits of the result.
129/// \param Y
130/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
131/// rounded up to the nearest integer and copied to the corresponding bits
132/// of the result.
133/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
134/// values.
135#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
136
137/// Rounds down each element of the 128-bit vector of [4 x float] to an
138/// integer and returns the rounded values in a 128-bit vector of
139/// [4 x float].
140///
/// Expands to <c> _mm_round_ps(X, _MM_FROUND_FLOOR) </c>.
///
141/// \headerfile <x86intrin.h>
142///
143/// \code
144/// __m128 _mm_floor_ps(__m128 X);
145/// \endcode
146///
147/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
148///
149/// \param X
150/// A 128-bit vector of [4 x float] values to be rounded down.
151/// \returns A 128-bit vector of [4 x float] containing the rounded values.
152#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
153
154/// Rounds down each element of the 128-bit vector of [2 x double] to an
155/// integer and returns the rounded values in a 128-bit vector of
156/// [2 x double].
157///
/// Expands to <c> _mm_round_pd(X, _MM_FROUND_FLOOR) </c>.
///
158/// \headerfile <x86intrin.h>
159///
160/// \code
161/// __m128d _mm_floor_pd(__m128d X);
162/// \endcode
163///
164/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
165///
166/// \param X
167/// A 128-bit vector of [2 x double] values to be rounded down.
168/// \returns A 128-bit vector of [2 x double] containing the rounded values.
169#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
170
171/// Copies three upper elements of the first 128-bit vector operand to
172/// the corresponding three upper elements of the 128-bit result vector of
173/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
174/// operand to an integer and copies it to the lowest element of the 128-bit
175/// result vector of [4 x float].
176///
/// Expands to <c> _mm_round_ss(X, Y, _MM_FROUND_FLOOR) </c>.
///
177/// \headerfile <x86intrin.h>
178///
179/// \code
180/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
181/// \endcode
182///
183/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
184///
185/// \param X
186/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
187/// copied to the corresponding bits of the result.
188/// \param Y
189/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
190/// rounded down to the nearest integer and copied to the corresponding bits
191/// of the result.
192/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
193/// values.
194#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
195
196/// Copies the upper element of the first 128-bit vector operand to the
197/// corresponding upper element of the 128-bit result vector of [2 x double].
198/// Rounds down the lower element of the second 128-bit vector operand to an
199/// integer and copies it to the lower element of the 128-bit result vector
200/// of [2 x double].
201///
/// Expands to <c> _mm_round_sd(X, Y, _MM_FROUND_FLOOR) </c>.
///
202/// \headerfile <x86intrin.h>
203///
204/// \code
205/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
206/// \endcode
207///
208/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
209///
210/// \param X
211/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
212/// copied to the corresponding bits of the result.
213/// \param Y
214/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
215/// rounded down to the nearest integer and copied to the corresponding bits
216/// of the result.
217/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
218/// values.
219#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
220
221/// Rounds each element of the 128-bit vector of [4 x float] to an
222/// integer value according to the rounding control specified by the second
223/// argument and returns the rounded values in a 128-bit vector of
224/// [4 x float].
225///
226/// \headerfile <x86intrin.h>
227///
228/// \code
229/// __m128 _mm_round_ps(__m128 X, const int M);
230/// \endcode
231///
232/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
233///
234/// \param X
235/// A 128-bit vector of [4 x float].
236/// \param M
237/// An integer value that specifies the rounding operation. \n
238/// Bits [7:4] are reserved. \n
239/// Bit [3] is a precision exception value: \n
240/// 0: A normal PE exception is used \n
241/// 1: The PE field is not updated \n
242/// Bit [2] is the rounding control source: \n
243/// 0: Use bits [1:0] of \a M \n
244/// 1: Use the current MXCSR setting \n
245/// Bits [1:0] contain the rounding control definition: \n
246/// 00: Nearest \n
247/// 01: Downward (toward negative infinity) \n
248/// 10: Upward (toward positive infinity) \n
249/// 11: Truncated \n
/// The <c>_MM_FROUND_*</c> macros defined in this header name these bit
/// values.
250/// \returns A 128-bit vector of [4 x float] containing the rounded values.
251#define _mm_round_ps(X, M) \
252 ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
253
254/// Copies three upper elements of the first 128-bit vector operand to
255/// the corresponding three upper elements of the 128-bit result vector of
256/// [4 x float]. Rounds the lowest element of the second 128-bit vector
257/// operand to an integer value according to the rounding control specified
258/// by the third argument and copies it to the lowest element of the 128-bit
259/// result vector of [4 x float].
260///
261/// \headerfile <x86intrin.h>
262///
263/// \code
264/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
265/// \endcode
266///
267/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
268///
269/// \param X
270/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
271/// copied to the corresponding bits of the result.
272/// \param Y
273/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
274/// rounded to an integer value using the specified rounding control and
275/// copied to the corresponding bits of the result.
276/// \param M
277/// An integer value that specifies the rounding operation. \n
278/// Bits [7:4] are reserved. \n
279/// Bit [3] is a precision exception value: \n
280/// 0: A normal PE exception is used \n
281/// 1: The PE field is not updated \n
282/// Bit [2] is the rounding control source: \n
283/// 0: Use bits [1:0] of \a M \n
284/// 1: Use the current MXCSR setting \n
285/// Bits [1:0] contain the rounding control definition: \n
286/// 00: Nearest \n
287/// 01: Downward (toward negative infinity) \n
288/// 10: Upward (toward positive infinity) \n
289/// 11: Truncated \n
/// The <c>_MM_FROUND_*</c> macros defined in this header name these bit
/// values.
290/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
291/// values.
292#define _mm_round_ss(X, Y, M) \
293 ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
294 (M)))
295
296/// Rounds each element of the 128-bit vector of [2 x double] to an
297/// integer value according to the rounding control specified by the second
298/// argument and returns the rounded values in a 128-bit vector of
299/// [2 x double].
300///
301/// \headerfile <x86intrin.h>
302///
303/// \code
304/// __m128d _mm_round_pd(__m128d X, const int M);
305/// \endcode
306///
307/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
308///
309/// \param X
310/// A 128-bit vector of [2 x double].
311/// \param M
312/// An integer value that specifies the rounding operation. \n
313/// Bits [7:4] are reserved. \n
314/// Bit [3] is a precision exception value: \n
315/// 0: A normal PE exception is used \n
316/// 1: The PE field is not updated \n
317/// Bit [2] is the rounding control source: \n
318/// 0: Use bits [1:0] of \a M \n
319/// 1: Use the current MXCSR setting \n
320/// Bits [1:0] contain the rounding control definition: \n
321/// 00: Nearest \n
322/// 01: Downward (toward negative infinity) \n
323/// 10: Upward (toward positive infinity) \n
324/// 11: Truncated \n
/// The <c>_MM_FROUND_*</c> macros defined in this header name these bit
/// values.
325/// \returns A 128-bit vector of [2 x double] containing the rounded values.
326#define _mm_round_pd(X, M) \
327 ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
328
329/// Copies the upper element of the first 128-bit vector operand to the
330/// corresponding upper element of the 128-bit result vector of [2 x double].
331/// Rounds the lower element of the second 128-bit vector operand to an
332/// integer value according to the rounding control specified by the third
333/// argument and copies it to the lower element of the 128-bit result vector
334/// of [2 x double].
335///
336/// \headerfile <x86intrin.h>
337///
338/// \code
339/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
340/// \endcode
341///
342/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
343///
344/// \param X
345/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
346/// copied to the corresponding bits of the result.
347/// \param Y
348/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
349/// rounded to an integer value using the specified rounding control and
350/// copied to the corresponding bits of the result.
351/// \param M
352/// An integer value that specifies the rounding operation. \n
353/// Bits [7:4] are reserved. \n
354/// Bit [3] is a precision exception value: \n
355/// 0: A normal PE exception is used \n
356/// 1: The PE field is not updated \n
357/// Bit [2] is the rounding control source: \n
358/// 0: Use bits [1:0] of \a M \n
359/// 1: Use the current MXCSR setting \n
360/// Bits [1:0] contain the rounding control definition: \n
361/// 00: Nearest \n
362/// 01: Downward (toward negative infinity) \n
363/// 10: Upward (toward positive infinity) \n
364/// 11: Truncated \n
/// The <c>_MM_FROUND_*</c> macros defined in this header name these bit
/// values.
365/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
366/// values.
367#define _mm_round_sd(X, Y, M) \
368 ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
369 (M)))
370
371/* SSE4 Packed Blending Intrinsics. */
372/// Returns a 128-bit vector of [2 x double] where the values are
373/// selected from either the first or second operand as specified by the
374/// third operand, the control mask.
375///
376/// \headerfile <x86intrin.h>
377///
378/// \code
379/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
380/// \endcode
381///
382/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
383///
384/// \param V1
385/// A 128-bit vector of [2 x double].
386/// \param V2
387/// A 128-bit vector of [2 x double].
388/// \param M
389/// An immediate integer operand, with mask bits [1:0] specifying how the
390/// values are to be copied. The position of the mask bit corresponds to the
391/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
392/// element in operand \a V1 is copied to the same position in the result.
393/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
394/// is copied to the same position in the result. For example, a mask of
/// 0x2 copies element 0 from \a V1 and element 1 from \a V2.
395/// \returns A 128-bit vector of [2 x double] containing the copied values.
396#define _mm_blend_pd(V1, V2, M) \
397 ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
398 (__v2df)(__m128d)(V2), (int)(M)))
399
400/// Returns a 128-bit vector of [4 x float] where the values are selected
401/// from either the first or second operand as specified by the third
402/// operand, the control mask.
403///
404/// \headerfile <x86intrin.h>
405///
406/// \code
407/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
408/// \endcode
409///
410/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
411///
412/// \param V1
413/// A 128-bit vector of [4 x float].
414/// \param V2
415/// A 128-bit vector of [4 x float].
416/// \param M
417/// An immediate integer operand, with mask bits [3:0] specifying how the
418/// values are to be copied. The position of the mask bit corresponds to the
419/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
420/// element in operand \a V1 is copied to the same position in the result.
421/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
422/// is copied to the same position in the result. For example, a mask of
/// 0x5 copies elements 0 and 2 from \a V2 and elements 1 and 3 from \a V1.
423/// \returns A 128-bit vector of [4 x float] containing the copied values.
424#define _mm_blend_ps(V1, V2, M) \
425 ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
426 (int)(M)))
427
428/// Returns a 128-bit vector of [2 x double] where the values are
429/// selected from either the first or second operand as specified by the
430/// third operand, the control mask.
431///
432/// \headerfile <x86intrin.h>
433///
434/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
435///
436/// \param __V1
437/// A 128-bit vector of [2 x double].
438/// \param __V2
439/// A 128-bit vector of [2 x double].
440/// \param __M
441/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
442/// values are to be copied. The position of the mask bit corresponds to the
443/// most significant bit of a copied value. When a mask bit is 0, the
444/// corresponding 64-bit element in operand \a __V1 is copied to the same
445/// position in the result. When a mask bit is 1, the corresponding 64-bit
446/// element in operand \a __V2 is copied to the same position in the result.
447/// \returns A 128-bit vector of [2 x double] containing the copied values.
448static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
449 __m128d __V2,
450 __m128d __M) {
451 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
452 (__v2df)__M);
453}
454
455/// Returns a 128-bit vector of [4 x float] where the values are
456/// selected from either the first or second operand as specified by the
457/// third operand, the control mask.
458///
459/// \headerfile <x86intrin.h>
460///
461/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
462///
463/// \param __V1
464/// A 128-bit vector of [4 x float].
465/// \param __V2
466/// A 128-bit vector of [4 x float].
467/// \param __M
468/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
469/// how the values are to be copied. The position of the mask bit corresponds
470/// to the most significant bit of a copied value. When a mask bit is 0, the
471/// corresponding 32-bit element in operand \a __V1 is copied to the same
472/// position in the result. When a mask bit is 1, the corresponding 32-bit
473/// element in operand \a __V2 is copied to the same position in the result.
474/// \returns A 128-bit vector of [4 x float] containing the copied values.
475static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
476 __m128 __V2,
477 __m128 __M) {
478 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
479 (__v4sf)__M);
480}
481
482/// Returns a 128-bit vector of [16 x i8] where the values are selected
483/// from either of the first or second operand as specified by the third
484/// operand, the control mask.
485///
486/// \headerfile <x86intrin.h>
487///
488/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
489///
490/// \param __V1
491/// A 128-bit vector of [16 x i8].
492/// \param __V2
493/// A 128-bit vector of [16 x i8].
494/// \param __M
495/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
496/// how the values are to be copied. The position of the mask bit corresponds
497/// to the most significant bit of a copied value. When a mask bit is 0, the
498/// corresponding 8-bit element in operand \a __V1 is copied to the same
499/// position in the result. When a mask bit is 1, the corresponding 8-bit
500/// element in operand \a __V2 is copied to the same position in the result.
501/// \returns A 128-bit vector of [16 x i8] containing the copied values.
502static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
503 __m128i __V2,
504 __m128i __M) {
505 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
506 (__v16qi)__M);
507}
508
509/// Returns a 128-bit vector of [8 x i16] where the values are selected
510/// from either of the first or second operand as specified by the third
511/// operand, the control mask.
512///
513/// \headerfile <x86intrin.h>
514///
515/// \code
516/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
517/// \endcode
518///
519/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
520///
521/// \param V1
522/// A 128-bit vector of [8 x i16].
523/// \param V2
524/// A 128-bit vector of [8 x i16].
525/// \param M
526/// An immediate integer operand, with mask bits [7:0] specifying how the
527/// values are to be copied. The position of the mask bit corresponds to the
528/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
529/// element in operand \a V1 is copied to the same position in the result.
530/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
531/// is copied to the same position in the result. For example, a mask of
/// 0x0F copies elements 0 through 3 from \a V2 and elements 4 through 7
/// from \a V1.
532/// \returns A 128-bit vector of [8 x i16] containing the copied values.
533#define _mm_blend_epi16(V1, V2, M) \
534 ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
535 (__v8hi)(__m128i)(V2), (int)(M)))
536
537/* SSE4 Dword Multiply Instructions. */
538/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
539/// and returns the lower 32 bits of each product in a 128-bit vector of
540/// [4 x i32].
541///
542/// \headerfile <x86intrin.h>
543///
544/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
545///
546/// \param __V1
547/// A 128-bit integer vector.
548/// \param __V2
549/// A 128-bit integer vector.
550/// \returns A 128-bit integer vector containing the products of both operands.
551static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
552_mm_mullo_epi32(__m128i __V1, __m128i __V2) {
/* The function is constexpr when built as C++11 or later, so the body stays
 * a single return statement. NOTE(review): the operands are cast to __v4su
 * rather than __v4si, presumably so the element-wise multiply wraps instead
 * of overflowing a signed type; confirm against the __v4su typedef. */
553 return (__m128i)((__v4su)__V1 * (__v4su)__V2);
554}
555
556/// Multiplies corresponding even-indexed elements of two 128-bit
557/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
558/// containing the products.
559///
560/// \headerfile <x86intrin.h>
561///
562/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
563///
564/// \param __V1
565/// A 128-bit vector of [4 x i32].
566/// \param __V2
567/// A 128-bit vector of [4 x i32].
568/// \returns A 128-bit vector of [2 x i64] containing the products of both
569/// operands.
570static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
571_mm_mul_epi32(__m128i __V1, __m128i __V2) {
/* The function is constexpr when built as C++11 or later, so the body stays
 * a single return statement that forwards to the builtin. */
572 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
573}
574
575/* SSE4 Floating Point Dot Product Instructions. */
576/// Computes the dot product of the two 128-bit vectors of [4 x float]
577/// and returns it in the elements of the 128-bit result vector of
578/// [4 x float].
579///
580/// The immediate integer operand controls which input elements
581/// will contribute to the dot product, and where the final results are
582/// returned.
583///
584/// \headerfile <x86intrin.h>
585///
586/// \code
587/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
588/// \endcode
589///
590/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
591///
592/// \param X
593/// A 128-bit vector of [4 x float].
594/// \param Y
595/// A 128-bit vector of [4 x float].
596/// \param M
597/// An immediate integer operand. Mask bits [7:4] determine which elements
598/// of the input vectors are used, with bit [4] corresponding to the lowest
599/// element and bit [7] corresponding to the highest element of each [4 x
600/// float] vector. If a bit is set, the corresponding elements from the two
601/// input vectors are used as an input for dot product; otherwise that input
602/// is treated as zero. Bits [3:0] determine which elements of the result
603/// will receive a copy of the final dot product, with bit [0] corresponding
604/// to the lowest element and bit [3] corresponding to the highest element of
605/// the [4 x float] result vector. If a bit is set, the dot product is returned
606/// in the corresponding element; otherwise that element is set to zero.
/// For example, a mask of 0xF1 uses all four element pairs as inputs and
/// returns the dot product in the lowest element only.
607/// \returns A 128-bit vector of [4 x float] containing the dot product.
608#define _mm_dp_ps(X, Y, M) \
609 ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
610
611/// Computes the dot product of the two 128-bit vectors of [2 x double]
612/// and returns it in the elements of the 128-bit result vector of
613/// [2 x double].
614///
615/// The immediate integer operand controls which input
616/// elements will contribute to the dot product, and where the final results
617/// are returned.
618///
619/// \headerfile <x86intrin.h>
620///
621/// \code
622/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
623/// \endcode
624///
625/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
626///
627/// \param X
628/// A 128-bit vector of [2 x double].
629/// \param Y
630/// A 128-bit vector of [2 x double].
631/// \param M
632/// An immediate integer operand. Mask bits [5:4] determine which elements
633/// of the input vectors are used, with bit [4] corresponding to the lowest
634/// element and bit [5] corresponding to the highest element of each [2 x
635/// double] vector. If a bit is set, the corresponding elements from the two
636/// input vectors are used as an input for dot product; otherwise that input
637/// is treated as zero. Bits [1:0] determine which elements of the result
638/// will receive a copy of the final dot product, with bit [0] corresponding
639/// to the lowest element and bit [1] corresponding to the highest element of
640/// the [2 x double] result vector. If a bit is set, the dot product is
641/// returned in the corresponding element; otherwise that element is set to zero.
/// For example, a mask of 0x31 uses both element pairs as inputs and
/// returns the dot product in the lowest element only.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
642#define _mm_dp_pd(X, Y, M) \
643 ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
644 (M)))
645
646/* SSE4 Streaming Load Hint Instruction. */
647/// Loads integer values from a 128-bit aligned memory location to a
648/// 128-bit integer vector.
649///
650/// \headerfile <x86intrin.h>
651///
652/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
653///
654/// \param __V
655/// A pointer to a 128-bit aligned memory location that contains the integer
656/// values.
657/// \returns A 128-bit integer vector containing the data stored at the
658/// specified memory location.
659static __inline__ __m128i __DEFAULT_FN_ATTRS
660_mm_stream_load_si128(const void *__V) {
661 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
662}
663
664/* SSE4 Packed Integer Min/Max Instructions. */
665/// Compares the corresponding elements of two 128-bit vectors of
666/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
667/// of the two values.
668///
669/// \headerfile <x86intrin.h>
670///
671/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
672///
673/// \param __V1
674/// A 128-bit vector of [16 x i8].
675/// \param __V2
676/// A 128-bit vector of [16 x i8]
677/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
678static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
679 __m128i __V2) {
680 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
681}
682
683/// Compares the corresponding elements of two 128-bit vectors of
684/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
685/// greater value of the two.
686///
687/// \headerfile <x86intrin.h>
688///
689/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
690///
691/// \param __V1
692/// A 128-bit vector of [16 x i8].
693/// \param __V2
694/// A 128-bit vector of [16 x i8].
695/// \returns A 128-bit vector of [16 x i8] containing the greater values.
696static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
697 __m128i __V2) {
698 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
699}
700
701/// Compares the corresponding elements of two 128-bit vectors of
702/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
703/// value of the two.
704///
705/// \headerfile <x86intrin.h>
706///
707/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
708///
709/// \param __V1
710/// A 128-bit vector of [8 x u16].
711/// \param __V2
712/// A 128-bit vector of [8 x u16].
713/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
714static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
715 __m128i __V2) {
716 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
717}
718
719/// Compares the corresponding elements of two 128-bit vectors of
720/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
721/// greater value of the two.
722///
723/// \headerfile <x86intrin.h>
724///
725/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
726///
727/// \param __V1
728/// A 128-bit vector of [8 x u16].
729/// \param __V2
730/// A 128-bit vector of [8 x u16].
731/// \returns A 128-bit vector of [8 x u16] containing the greater values.
732static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
733 __m128i __V2) {
734 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
735}
736
737/// Compares the corresponding elements of two 128-bit vectors of
738/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
739/// value of the two.
740///
741/// \headerfile <x86intrin.h>
742///
743/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
744///
745/// \param __V1
746/// A 128-bit vector of [4 x i32].
747/// \param __V2
748/// A 128-bit vector of [4 x i32].
749/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
750static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
751 __m128i __V2) {
752 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
753}
754
755/// Compares the corresponding elements of two 128-bit vectors of
756/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
757/// greater value of the two.
758///
759/// \headerfile <x86intrin.h>
760///
761/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
762///
763/// \param __V1
764/// A 128-bit vector of [4 x i32].
765/// \param __V2
766/// A 128-bit vector of [4 x i32].
767/// \returns A 128-bit vector of [4 x i32] containing the greater values.
768static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
769 __m128i __V2) {
770 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
771}
772
773/// Compares the corresponding unsigned 32-bit elements of two 128-bit
774/// vectors of [4 x u32] and returns a 128-bit vector of [4 x u32] containing,
775/// for each element position, the lesser of the two values.
776///
777/// \headerfile <x86intrin.h>
778///
779/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
780///
781/// \param __V1
782/// A 128-bit vector of [4 x u32]; the first operand of the comparison.
783/// \param __V2
784/// A 128-bit vector of [4 x u32]; the second operand of the comparison.
785/// \returns A 128-bit vector of [4 x u32] containing the element-wise minimum.
786static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
787 __m128i __V2) {
788 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
789}
790
791/// Compares the corresponding unsigned 32-bit elements of two 128-bit
792/// vectors of [4 x u32] and returns a 128-bit vector of [4 x u32] containing,
793/// for each element position, the greater of the two values.
794///
795/// \headerfile <x86intrin.h>
796///
797/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
798///
799/// \param __V1
800/// A 128-bit vector of [4 x u32]; the first operand of the comparison.
801/// \param __V2
802/// A 128-bit vector of [4 x u32]; the second operand of the comparison.
803/// \returns A 128-bit vector of [4 x u32] containing the element-wise maximum.
804static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
805 __m128i __V2) {
806 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
807}
808
809/* SSE4 Insertion and Extraction from XMM Register Instructions. */
810/// Copies the first argument \a X, replaces one element of the copy with an
811/// element of the second argument \a Y, and then clears selected elements of
812/// the result. The source element, the destination element and the cleared
813/// elements are all chosen by the third argument \a N.
814///
815/// \headerfile <x86intrin.h>
816///
817/// \code
818/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
819/// \endcode
820///
821/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
822///
823/// \param X
824/// A 128-bit vector source operand of [4 x float]. With the exception of
825/// those bits in the result copied from parameter \a Y and zeroed by bits
826/// [3:0] of \a N, all bits from this parameter are copied to the result.
827/// \param Y
828/// A 128-bit vector source operand of [4 x float]. One single-precision
829/// floating-point element from this source, as determined by the immediate
830/// parameter, is copied to the result.
831/// \param N
832/// Specifies which bits from operand \a Y will be copied, which bits in the
833/// result they will be copied to, and which bits in the result will be
834/// cleared. The following assignments are made: \n
835/// Bits [7:6] specify the bits to copy from operand \a Y: \n
836/// 00: Selects bits [31:0] from operand \a Y. \n
837/// 01: Selects bits [63:32] from operand \a Y. \n
838/// 10: Selects bits [95:64] from operand \a Y. \n
839/// 11: Selects bits [127:96] from operand \a Y. \n
840/// Bits [5:4] specify the bits in the result to which the selected bits
841/// from operand \a Y are copied: \n
842/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
843/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
844/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
845/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
846/// Bits [3:0] form a zero mask: each bit that is set clears the
847/// corresponding 32-bit element of the result.
848/// \returns A 128-bit vector of [4 x float] containing the copied
849/// single-precision floating-point elements from the operands.
850#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
851
852/// Extracts one element of a 128-bit vector of [4 x float], selected by the
853/// immediate \a N, and returns its 32 bits reinterpreted (not converted) as int.
854///
855/// \headerfile <x86intrin.h>
856///
857/// \code
858/// int _mm_extract_ps(__m128 X, const int N);
859/// \endcode
860///
861/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
862/// instruction.
863///
864/// \param X
865/// A 128-bit vector of [4 x float].
866/// \param N
867/// An immediate value. Bits [1:0] determine which bits from the argument
868/// \a X are extracted and returned: \n
869/// 00: Bits [31:0] of parameter \a X are returned. \n
870/// 01: Bits [63:32] of parameter \a X are returned. \n
871/// 10: Bits [95:64] of parameter \a X are returned. \n
872/// 11: Bits [127:96] of parameter \a X are returned.
873/// \returns A 32-bit integer containing the extracted 32 bits of float data.
874#define _mm_extract_ps(X, N) \
875 __builtin_bit_cast( \
876 int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
877
878/* Miscellaneous insert and extract macros. */
879/* Assign the single-precision float at element index N of X to the lvalue D. Expands to a statement, not an expression. */
880#define _MM_EXTRACT_FLOAT(D, X, N) \
881 do { \
882 (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
883 } while (0)
884
885/* Build an immediate suitable for _mm_insert_ps: X is placed in bits [7:6],
886 Y in bits [5:4], and the zeroing bits Z are OR-ed in unshifted. */
887#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
888
889/* Return a vector whose first element is element N of X; the zero mask 0x0e is applied to an all-zero base vector. */
890#define _MM_PICK_OUT_PS(X, N) \
891 _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
892
893/* Insert an integer into a packed integer vector at a given element index. */
894/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
895/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
896/// of an integer parameter \a I at the element position selected by the
897/// immediate value parameter \a N.
898///
899/// \headerfile <x86intrin.h>
900///
901/// \code
902/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
903/// \endcode
904///
905/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
906///
907/// \param X
908/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
909/// result and then one of the sixteen elements in the result vector is
910/// replaced by the lower 8 bits of \a I.
911/// \param I
912/// An integer. The lower 8 bits of this operand are written to the result
913/// at the element position specified by \a N.
914/// \param N
915/// An immediate value. Bits [3:0] select which of the sixteen 8-bit
916/// elements of the result receives the lower 8 bits of \a I. \n
917/// 0000: Bits [7:0] of the result are used for insertion. \n
918/// 0001: Bits [15:8] of the result are used for insertion. \n
919/// 0010: Bits [23:16] of the result are used for insertion. \n
920/// 0011: Bits [31:24] of the result are used for insertion. \n
921/// 0100: Bits [39:32] of the result are used for insertion. \n
922/// 0101: Bits [47:40] of the result are used for insertion. \n
923/// 0110: Bits [55:48] of the result are used for insertion. \n
924/// 0111: Bits [63:56] of the result are used for insertion. \n
925/// 1000: Bits [71:64] of the result are used for insertion. \n
926/// 1001: Bits [79:72] of the result are used for insertion. \n
927/// 1010: Bits [87:80] of the result are used for insertion. \n
928/// 1011: Bits [95:88] of the result are used for insertion. \n
929/// 1100: Bits [103:96] of the result are used for insertion. \n
930/// 1101: Bits [111:104] of the result are used for insertion. \n
931/// 1110: Bits [119:112] of the result are used for insertion. \n
932/// 1111: Bits [127:120] of the result are used for insertion.
933/// \returns A 128-bit vector of [16 x i8]: \a X with the selected element replaced.
934#define _mm_insert_epi8(X, I, N) \
935 ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
936 (int)(N)))
937
938/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
939/// the 128-bit integer vector parameter, and then inserting the 32-bit
940/// integer parameter \a I at the element position selected by the immediate
941/// value parameter \a N.
942///
943/// \headerfile <x86intrin.h>
944///
945/// \code
946/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
947/// \endcode
948///
949/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
950///
951/// \param X
952/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
953/// result and then one of the four elements in the result vector is
954/// replaced by \a I.
955/// \param I
956/// A 32-bit integer that is written to the result at the element position
957/// specified by \a N.
958/// \param N
959/// An immediate value. Bits [1:0] select which of the four 32-bit elements
960/// of the result is replaced by the integer \a I. \n
961/// 00: Bits [31:0] of the result are used for insertion. \n
962/// 01: Bits [63:32] of the result are used for insertion. \n
963/// 10: Bits [95:64] of the result are used for insertion. \n
964/// 11: Bits [127:96] of the result are used for insertion.
965/// \returns A 128-bit vector of [4 x i32]: \a X with the selected element replaced.
966#define _mm_insert_epi32(X, I, N) \
967 ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
968 (int)(N)))
969
970#ifdef __x86_64__
971/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
972/// the 128-bit integer vector parameter, and then inserting the 64-bit
973/// integer parameter \a I at the element position selected by the immediate
974/// value parameter \a N. This macro is only defined when targeting x86-64.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
983///
984/// \param X
985/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
986/// result and then one of the two elements in the result vector is replaced
987/// by \a I.
988/// \param I
989/// A 64-bit integer that is written to the result at the element position
990/// specified by \a N.
991/// \param N
992/// An immediate value. Bit [0] selects which of the two 64-bit elements of
993/// the result is replaced by the integer \a I. \n
994/// 0: Bits [63:0] of the result are used for insertion. \n
995/// 1: Bits [127:64] of the result are used for insertion.
996/// \returns A 128-bit vector of [2 x i64]: \a X with the selected element replaced.
997#define _mm_insert_epi64(X, I, N) \
998 ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \
999 (int)(N)))
1000#endif /* __x86_64__ */
1001
1002/* Extract an integer from a packed integer vector at a given element index.
1003 * The 8-bit form zero-extends the element, so its result is never negative.
1004 */
1005/// Extracts an 8-bit element from the 128-bit integer vector of
1006/// [16 x i8], using the immediate value parameter \a N as a selector.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// \code
1011/// int _mm_extract_epi8(__m128i X, const int N);
1012/// \endcode
1013///
1014/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1015///
1016/// \param X
1017/// A 128-bit integer vector.
1018/// \param N
1019/// An immediate value. Bits [3:0] specify which 8-bit vector element from
1020/// the argument \a X to extract and copy to the result. \n
1021/// 0000: Bits [7:0] of the parameter \a X are extracted. \n
1022/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
1023/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
1024/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
1025/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
1026/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
1027/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
1028/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
1029/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
1030/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
1031/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
1032/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
1033/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
1034/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
1035/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
1036/// 1111: Bits [127:120] of the parameter \a X are extracted.
1037/// \returns An int in the range [0, 255]: the selected 8-bit element is
1038/// zero-extended, so the lower 8 bits come from the vector parameter and the
1039/// remaining bits are zeros.
1040#define _mm_extract_epi8(X, N) \
1041 ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
1042 (int)(N)))
1043
1044/// Extracts a 32-bit element from the 128-bit integer vector of
1045/// [4 x i32], using the immediate value parameter \a N as a selector.
1046///
1047/// \headerfile <x86intrin.h>
1048///
1049/// \code
1050/// int _mm_extract_epi32(__m128i X, const int N);
1051/// \endcode
1052///
1053/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
1054///
1055/// \param X
1056/// A 128-bit integer vector.
1057/// \param N
1058/// An immediate value. Bits [1:0] specify which 32-bit vector element from
1059/// the argument \a X to extract and copy to the result. \n
1060/// 00: Bits [31:0] of the parameter \a X are extracted. \n
1061/// 01: Bits [63:32] of the parameter \a X are extracted. \n
1062/// 10: Bits [95:64] of the parameter \a X are extracted. \n
1063/// 11: Bits [127:96] of the parameter \a X are extracted.
1064/// \returns A 32-bit integer containing the selected element of the 128-bit
1065/// integer vector parameter.
1066#define _mm_extract_epi32(X, N) \
1067 ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
1068
1069/// Extracts a 64-bit element from the 128-bit integer vector of
1070/// [2 x i64], using the immediate value parameter \a N as a selector.
1071///
1072/// \headerfile <x86intrin.h>
1073///
1074/// \code
1075/// long long _mm_extract_epi64(__m128i X, const int N);
1076/// \endcode
1077///
1078/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
1079/// in 64-bit mode.
1080///
1081/// \param X
1082/// A 128-bit integer vector.
1083/// \param N
1084/// An immediate value. Bit [0] specifies which 64-bit vector element from
1085/// the argument \a X to return. \n
1086/// 0: Bits [63:0] are returned. \n
1087/// 1: Bits [127:64] are returned.
1088/// \returns A 64-bit integer containing the selected element of \a X.
1089#define _mm_extract_epi64(X, N) \
1090 ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
1091
1092/* SSE4 128-bit Packed Integer Comparisons. */
1093/// Tests whether the bits of a 128-bit integer vector selected by a mask are
1094/// all zeros, i.e. whether the bitwise AND of the vector and the mask is zero.
1095///
1096/// \headerfile <x86intrin.h>
1097///
1098/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1099///
1100/// \param __M
1101/// A 128-bit integer vector containing the bits to be tested.
1102/// \param __V
1103/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1104/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1105static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1106 __m128i __V) {
1107 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1108}
1109
1110/// Tests whether the bits of a 128-bit integer vector selected by a mask are
1111/// all ones, i.e. whether every bit set in the mask is also set in the vector.
1112///
1113/// \headerfile <x86intrin.h>
1114///
1115/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1116///
1117/// \param __M
1118/// A 128-bit integer vector containing the bits to be tested.
1119/// \param __V
1120/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1121/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1122static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1123 __m128i __V) {
1124 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1125}
1126
1127/// Tests whether the bits of a 128-bit integer vector selected by a mask are
1128/// mixed: at least one selected bit is zero and at least one is one.
1129///
1130/// \headerfile <x86intrin.h>
1131///
1132/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1133///
1134/// \param __M
1135/// A 128-bit integer vector containing the bits to be tested.
1136/// \param __V
1137/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1138/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1139/// FALSE otherwise.
1140static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1141 __m128i __V) {
1142 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1143}
1144
1145/// Tests whether all bits of a 128-bit integer vector are ones. Implemented
1146/// as _mm_testc_si128() with a mask built by _mm_set1_epi32(-1).
1147///
1148/// \headerfile <x86intrin.h>
1149///
1150/// \code
1151/// int _mm_test_all_ones(__m128i V);
1152/// \endcode
1153///
1154/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1155///
1156/// \param V
1157/// A 128-bit integer vector containing the bits to be tested.
1158/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1159/// otherwise.
1160#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
1161
1162/// Tests whether the specified bits in a 128-bit integer vector are neither
1163/// all zeros nor all ones. Expands to _mm_testnzc_si128(M, V).
1164///
1165/// \headerfile <x86intrin.h>
1166///
1167/// \code
1168/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1169/// \endcode
1170///
1171/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1172///
1173/// \param M
1174/// A 128-bit integer vector containing the bits to be tested.
1175/// \param V
1176/// A 128-bit integer vector selecting which bits to test in operand \a M.
1177/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1178/// FALSE otherwise.
1179#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
1180
1181/// Tests whether the specified bits in a 128-bit integer vector are all
1182/// zeros. Expands to _mm_testz_si128(M, V).
1183///
1184/// \headerfile <x86intrin.h>
1185///
1186/// \code
1187/// int _mm_test_all_zeros(__m128i M, __m128i V);
1188/// \endcode
1189///
1190/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1191///
1192/// \param M
1193/// A 128-bit integer vector containing the bits to be tested.
1194/// \param V
1195/// A 128-bit integer vector selecting which bits to test in operand \a M.
1196/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1197#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
1198
1199/* SSE4 64-bit Packed Integer Comparisons. */
1200/// Compares each pair of corresponding 64-bit elements of two 128-bit
1201/// integer vectors for equality.
1202///
1203/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1204///
1205/// \headerfile <x86intrin.h>
1206///
1207/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1208///
1209/// \param __V1
1210/// A 128-bit integer vector, compared as two 64-bit elements.
1211/// \param __V2
1212/// A 128-bit integer vector, compared as two 64-bit elements.
1213/// \returns A 128-bit integer vector containing the two 64-bit comparison results.
1214static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1215_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) {
1216 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1217}
1218
1219/* SSE4 Packed Integer Sign-Extension. */
1220/// Sign-extends each of the lower eight 8-bit integer elements of a
1221/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1222/// 128-bit vector of [8 x i16], preserving element order. The upper eight
1223/// elements of the input vector are ignored.
1224///
1225/// \headerfile <x86intrin.h>
1226///
1227/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1228///
1229/// \param __V
1230/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1231/// sign-extended to 16-bit values.
1232/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1233static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1234_mm_cvtepi8_epi16(__m128i __V) {
1235 /* The extension must always be signed. __v16qi is a vector of plain char,
1236 which may be signed or unsigned, so __v16qs is used instead. */
1237 return (__m128i) __builtin_convertvector(
1238 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1239 7),
1240 __v8hi);
1241}
1242
1243/// Sign-extends each of the lower four 8-bit integer elements of a
1244/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1245/// 128-bit vector of [4 x i32], preserving element order. The upper twelve
1246/// elements of the input vector are ignored.
1247///
1248/// \headerfile <x86intrin.h>
1249///
1250/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1251///
1252/// \param __V
1253/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1254/// sign-extended to 32-bit values.
1255/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1256static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1257_mm_cvtepi8_epi32(__m128i __V) {
1258 /* The extension must always be signed. __v16qi is a vector of plain char,
1259 which may be signed or unsigned, so __v16qs is used instead. */
1260 return (__m128i) __builtin_convertvector(
1261 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1262}
1263
1264/// Sign-extends each of the lower two 8-bit integer elements of a
1265/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1266/// a 128-bit vector of [2 x i64], preserving element order. The upper
1267/// fourteen elements of the input vector are ignored.
1268///
1269/// \headerfile <x86intrin.h>
1270///
1271/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1272///
1273/// \param __V
1274/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1275/// sign-extended to 64-bit values.
1276/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1277static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1278_mm_cvtepi8_epi64(__m128i __V) {
1279 /* The extension must always be signed. __v16qi is a vector of plain char,
1280 which may be signed or unsigned, so __v16qs is used instead. */
1281 return (__m128i) __builtin_convertvector(
1282 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1283}
1284
1285/// Sign-extends each of the lower four 16-bit integer elements of a
1286/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1287/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1288/// vector are unused.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1293///
1294/// \param __V
1295/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1296/// sign-extended to 32-bit values.
1297/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1298static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1300 return (__m128i) __builtin_convertvector(
1301 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1302}
1303
1304/// Sign-extends each of the lower two 16-bit integer elements of a
1305/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1306/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1307/// vector are unused.
1308///
1309/// \headerfile <x86intrin.h>
1310///
1311/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1312///
1313/// \param __V
1314/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1315/// sign-extended to 64-bit values.
1316/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1317static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1319 return (__m128i) __builtin_convertvector(
1320 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1321}
1322
1323/// Sign-extends each of the lower two 32-bit integer elements of a
1324/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1325/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1326/// are unused.
1327///
1328/// \headerfile <x86intrin.h>
1329///
1330/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1331///
1332/// \param __V
1333/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1334/// sign-extended to 64-bit values.
1335/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1336static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1338 return (__m128i) __builtin_convertvector(
1339 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1340}
1341
1342/* SSE4 Packed Integer Zero-Extension. */
1343/// Zero-extends each of the lower eight 8-bit integer elements of a
1344/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1345/// 128-bit vector of [8 x i16], preserving element order. The upper eight
1346/// elements of the input vector are ignored.
1347///
1348/// \headerfile <x86intrin.h>
1349///
1350/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1351///
1352/// \param __V
1353/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1354/// zero-extended to 16-bit values.
1355/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1356static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1357_mm_cvtepu8_epi16(__m128i __V) {
1358 return (__m128i) __builtin_convertvector(
1359 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1360 7),
1361 __v8hi);
1362}
1363
1364/// Zero-extends each of the lower four 8-bit integer elements of a
1365/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1366/// 128-bit vector of [4 x i32], preserving element order. The upper twelve
1367/// elements of the input vector are ignored.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1372///
1373/// \param __V
1374/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1375/// zero-extended to 32-bit values.
1376/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1377static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1378_mm_cvtepu8_epi32(__m128i __V) {
1379 return (__m128i) __builtin_convertvector(
1380 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1381}
1382
1383/// Zero-extends each of the lower two 8-bit integer elements of a
1384/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1385/// a 128-bit vector of [2 x i64], preserving element order. The upper
1386/// fourteen elements of the input vector are ignored.
1387///
1388/// \headerfile <x86intrin.h>
1389///
1390/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1391///
1392/// \param __V
1393/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1394/// zero-extended to 64-bit values.
1395/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1396static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1397_mm_cvtepu8_epi64(__m128i __V) {
1398 return (__m128i) __builtin_convertvector(
1399 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1400}
1401
1402/// Zero-extends each of the lower four 16-bit integer elements of a
1403/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1404/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1405/// vector are unused.
1406///
1407/// \headerfile <x86intrin.h>
1408///
1409/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1410///
1411/// \param __V
1412/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1413/// zero-extended to 32-bit values.
1414/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1415static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1417 return (__m128i) __builtin_convertvector(
1418 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1419}
1420
1421/// Zero-extends each of the lower two 16-bit integer elements of a
1422/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1423/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1424/// are unused.
1425///
1426/// \headerfile <x86intrin.h>
1427///
1428/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1429///
1430/// \param __V
1431/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1432/// zero-extended to 64-bit values.
1433/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1434static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1436 return (__m128i) __builtin_convertvector(
1437 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1438}
1439
1440/// Zero-extends each of the lower two 32-bit integer elements of a
1441/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1442/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1443/// are unused.
1444///
1445/// \headerfile <x86intrin.h>
1446///
1447/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1448///
1449/// \param __V
1450/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1451/// zero-extended to 64-bit values.
1452/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1453static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1455 return (__m128i) __builtin_convertvector(
1456 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1457}
1458
1459/* SSE4 Pack with Unsigned Saturation. */
1460/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1461/// vector operands into 16-bit unsigned integers, and returns the packed
1462/// result.
1463///
1464/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1465/// 0x0000 (negative values) are saturated to 0x0000.
1466///
1467/// \headerfile <x86intrin.h>
1468///
1469/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1470///
1471/// \param __V1
1472/// A 128-bit vector of [4 x i32]. The converted [4 x u16] values are
1473/// written to the lower 64 bits of the result.
1474/// \param __V2
1475/// A 128-bit vector of [4 x i32]. The converted [4 x u16] values are
1476/// written to the upper 64 bits of the result.
1477/// \returns A 128-bit vector of [8 x u16] containing the converted values.
1478static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
1479 __m128i __V2) {
1480 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1481}
1482
1483/* SSE4 Multiple Packed Sums of Absolute Difference. */
1484/// Computes eight sums of absolute differences of unsigned 8-bit integers.
1485/// Each sum covers four consecutive elements of \a X and four consecutive
1486/// elements of \a Y; the starting offsets within both operands are selected
1487/// by bit fields of the immediate operand \a M.
1488///
1489/// \headerfile <x86intrin.h>
1490///
1491/// \code
1492/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
1493/// \endcode
1494///
1495/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
1496///
1497/// \param X
1498/// A 128-bit vector of [16 x i8], treated as unsigned 8-bit integers.
1499/// \param Y
1500/// A 128-bit vector of [16 x i8], treated as unsigned 8-bit integers.
1501/// \param M
1502/// An 8-bit immediate operand specifying how the absolute differences are to
1503/// be calculated, according to the following algorithm:
1504/// \code
1505/// // M2 represents bit 2 of the immediate operand
1506/// // M10 represents bits [1:0] of the immediate operand
1507/// i = M2 * 4;
1508/// j = M10 * 4;
1509/// for (k = 0; k < 8; k = k + 1) {
1510/// d0 = abs(X[i + k + 0] - Y[j + 0]);
1511/// d1 = abs(X[i + k + 1] - Y[j + 1]);
1512/// d2 = abs(X[i + k + 2] - Y[j + 2]);
1513/// d3 = abs(X[i + k + 3] - Y[j + 3]);
1514/// r[k] = d0 + d1 + d2 + d3;
1515/// }
1516/// \endcode
1517/// \returns A 128-bit integer vector containing the eight sums r[0] through
1518/// r[7] of absolute differences computed as shown above.
1519#define _mm_mpsadbw_epu8(X, Y, M) \
1520 ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
1521 (__v16qi)(__m128i)(Y), (M)))
1522
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it along with its index.
1525///
1526/// \headerfile <x86intrin.h>
1527///
1528/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1529/// instruction.
1530///
1531/// \param __V
1532/// A 128-bit vector of [8 x u16].
1533/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1534/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1535/// and the remaining bits are set to 0.
1536static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
1537 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1538}
1539
/* The SSE4.2 definitions follow. */

/* They would normally live in nmmintrin.h, but gcc keeps them in this
   header, so we do the same. */

/* From this point on the default function attributes target SSE4.2. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("sse4.2")))
1548
/* Immediate bits [1:0]: the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS 0x00 /* 16 unsigned bytes */
#define _SIDD_UWORD_OPS 0x01 /* 8 unsigned words */
#define _SIDD_SBYTE_OPS 0x02 /* 16 signed bytes */
#define _SIDD_SWORD_OPS 0x03 /* 8 signed words */

/* Immediate bits [3:2]: the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY 0x00     /* subset */
#define _SIDD_CMP_RANGES 0x04        /* ranges */
#define _SIDD_CMP_EQUAL_EACH 0x08    /* match */
#define _SIDD_CMP_EQUAL_ORDERED 0x0c /* substring */

/* Immediate bits [5:4]: the polarity of the operation, i.e. whether the bit
   mask of the comparison results is negated. */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* Immediate bit [6] for _mm_cmpXstri(): whether the index of the least or the
   most significant set bit is returned. */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* Immediate bit [6] for _mm_cmpXstrm(): whether the returned mask is a
   zero-extended bit mask or is expanded to one element per character. */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
1574
1575/* SSE4.2 Packed Comparison Intrinsics. */
1576/// Uses the immediate operand \a M to perform a comparison of string
1577/// data with implicitly defined lengths that is contained in source operands
1578/// \a A and \a B. Returns a 128-bit integer vector representing the result
1579/// mask of the comparison.
1580///
1581/// \headerfile <x86intrin.h>
1582///
1583/// \code
1584/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1585/// \endcode
1586///
1587/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1588/// instruction.
1589///
1590/// \param A
1591/// A 128-bit integer vector containing one of the source operands to be
1592/// compared.
1593/// \param B
1594/// A 128-bit integer vector containing one of the source operands to be
1595/// compared.
1596/// \param M
1597/// An 8-bit immediate operand specifying whether the characters are bytes or
1598/// words, the type of comparison to perform, and the format of the return
1599/// value. \n
1600/// Bits [1:0]: Determine source data format. \n
1601/// 00: 16 unsigned bytes \n
1602/// 01: 8 unsigned words \n
1603/// 10: 16 signed bytes \n
1604/// 11: 8 signed words \n
1605/// Bits [3:2]: Determine comparison type and aggregation method. \n
1606/// 00: Subset: Each character in \a B is compared for equality with all
1607/// the characters in \a A. \n
1608/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1609/// basis is greater than or equal for even-indexed elements in \a A,
1610/// and less than or equal for odd-indexed elements in \a A. \n
1611/// 10: Match: Compare each pair of corresponding characters in \a A and
1612/// \a B for equality. \n
1613/// 11: Substring: Search \a B for substring matches of \a A. \n
1614/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1615/// mask of the comparison results. \n
1616/// 00: No effect. \n
1617/// 01: Negate the bit mask. \n
1618/// 10: No effect. \n
1619/// 11: Negate the bit mask only for bits with an index less than or equal
1620/// to the size of \a A or \a B. \n
1621/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1622/// bytes. \n
1623/// 0: The result is zero-extended to 16 bytes. \n
1624/// 1: The result is expanded to 16 bytes (this expansion is performed by
1625/// repeating each bit 8 or 16 times).
1626/// \returns Returns a 128-bit integer vector representing the result mask of
1627/// the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
1631
1632/// Uses the immediate operand \a M to perform a comparison of string
1633/// data with implicitly defined lengths that is contained in source operands
1634/// \a A and \a B. Returns an integer representing the result index of the
1635/// comparison.
1636///
1637/// \headerfile <x86intrin.h>
1638///
1639/// \code
1640/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1641/// \endcode
1642///
1643/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1644/// instruction.
1645///
1646/// \param A
1647/// A 128-bit integer vector containing one of the source operands to be
1648/// compared.
1649/// \param B
1650/// A 128-bit integer vector containing one of the source operands to be
1651/// compared.
1652/// \param M
1653/// An 8-bit immediate operand specifying whether the characters are bytes or
1654/// words, the type of comparison to perform, and the format of the return
1655/// value. \n
1656/// Bits [1:0]: Determine source data format. \n
1657/// 00: 16 unsigned bytes \n
1658/// 01: 8 unsigned words \n
1659/// 10: 16 signed bytes \n
1660/// 11: 8 signed words \n
1661/// Bits [3:2]: Determine comparison type and aggregation method. \n
1662/// 00: Subset: Each character in \a B is compared for equality with all
1663/// the characters in \a A. \n
1664/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1665/// basis is greater than or equal for even-indexed elements in \a A,
1666/// and less than or equal for odd-indexed elements in \a A. \n
1667/// 10: Match: Compare each pair of corresponding characters in \a A and
1668/// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1670/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1671/// mask of the comparison results. \n
1672/// 00: No effect. \n
1673/// 01: Negate the bit mask. \n
1674/// 10: No effect. \n
1675/// 11: Negate the bit mask only for bits with an index less than or equal
1676/// to the size of \a A or \a B. \n
1677/// Bit [6]: Determines whether the index of the lowest set bit or the
1678/// highest set bit is returned. \n
1679/// 0: The index of the least significant set bit. \n
1680/// 1: The index of the most significant set bit. \n
1681/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
1685
1686/// Uses the immediate operand \a M to perform a comparison of string
1687/// data with explicitly defined lengths that is contained in source operands
1688/// \a A and \a B. Returns a 128-bit integer vector representing the result
1689/// mask of the comparison.
1690///
1691/// \headerfile <x86intrin.h>
1692///
1693/// \code
1694/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1695/// \endcode
1696///
1697/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1698/// instruction.
1699///
1700/// \param A
1701/// A 128-bit integer vector containing one of the source operands to be
1702/// compared.
1703/// \param LA
1704/// An integer that specifies the length of the string in \a A.
1705/// \param B
1706/// A 128-bit integer vector containing one of the source operands to be
1707/// compared.
1708/// \param LB
1709/// An integer that specifies the length of the string in \a B.
1710/// \param M
1711/// An 8-bit immediate operand specifying whether the characters are bytes or
1712/// words, the type of comparison to perform, and the format of the return
1713/// value. \n
1714/// Bits [1:0]: Determine source data format. \n
1715/// 00: 16 unsigned bytes \n
1716/// 01: 8 unsigned words \n
1717/// 10: 16 signed bytes \n
1718/// 11: 8 signed words \n
1719/// Bits [3:2]: Determine comparison type and aggregation method. \n
1720/// 00: Subset: Each character in \a B is compared for equality with all
1721/// the characters in \a A. \n
1722/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1723/// basis is greater than or equal for even-indexed elements in \a A,
1724/// and less than or equal for odd-indexed elements in \a A. \n
1725/// 10: Match: Compare each pair of corresponding characters in \a A and
1726/// \a B for equality. \n
1727/// 11: Substring: Search \a B for substring matches of \a A. \n
1728/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1729/// mask of the comparison results. \n
1730/// 00: No effect. \n
1731/// 01: Negate the bit mask. \n
1732/// 10: No effect. \n
1733/// 11: Negate the bit mask only for bits with an index less than or equal
1734/// to the size of \a A or \a B. \n
1735/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1736/// bytes. \n
1737/// 0: The result is zero-extended to 16 bytes. \n
1738/// 1: The result is expanded to 16 bytes (this expansion is performed by
1739/// repeating each bit 8 or 16 times). \n
1740/// \returns Returns a 128-bit integer vector representing the result mask of
1741/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128( \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */ \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */ \
      (int)(M)))                        /* immediate control operand */
1746
1747/// Uses the immediate operand \a M to perform a comparison of string
1748/// data with explicitly defined lengths that is contained in source operands
1749/// \a A and \a B. Returns an integer representing the result index of the
1750/// comparison.
1751///
1752/// \headerfile <x86intrin.h>
1753///
1754/// \code
1755/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1756/// \endcode
1757///
1758/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1759/// instruction.
1760///
1761/// \param A
1762/// A 128-bit integer vector containing one of the source operands to be
1763/// compared.
1764/// \param LA
1765/// An integer that specifies the length of the string in \a A.
1766/// \param B
1767/// A 128-bit integer vector containing one of the source operands to be
1768/// compared.
1769/// \param LB
1770/// An integer that specifies the length of the string in \a B.
1771/// \param M
1772/// An 8-bit immediate operand specifying whether the characters are bytes or
1773/// words, the type of comparison to perform, and the format of the return
1774/// value. \n
1775/// Bits [1:0]: Determine source data format. \n
1776/// 00: 16 unsigned bytes \n
1777/// 01: 8 unsigned words \n
1778/// 10: 16 signed bytes \n
1779/// 11: 8 signed words \n
1780/// Bits [3:2]: Determine comparison type and aggregation method. \n
1781/// 00: Subset: Each character in \a B is compared for equality with all
1782/// the characters in \a A. \n
1783/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1784/// basis is greater than or equal for even-indexed elements in \a A,
1785/// and less than or equal for odd-indexed elements in \a A. \n
1786/// 10: Match: Compare each pair of corresponding characters in \a A and
1787/// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1789/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1790/// mask of the comparison results. \n
1791/// 00: No effect. \n
1792/// 01: Negate the bit mask. \n
1793/// 10: No effect. \n
1794/// 11: Negate the bit mask only for bits with an index less than or equal
1795/// to the size of \a A or \a B. \n
1796/// Bit [6]: Determines whether the index of the lowest set bit or the
1797/// highest set bit is returned. \n
1798/// 0: The index of the least significant set bit. \n
1799/// 1: The index of the most significant set bit. \n
1800/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128( \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */ \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */ \
      (int)(M)))                        /* immediate control operand */
1805
1806/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1807/// Uses the immediate operand \a M to perform a comparison of string
1808/// data with implicitly defined lengths that is contained in source operands
1809/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1810/// string in \a B is the maximum, otherwise, returns 0.
1811///
1812/// \headerfile <x86intrin.h>
1813///
1814/// \code
1815/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1816/// \endcode
1817///
1818/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1819/// instruction.
1820///
1821/// \param A
1822/// A 128-bit integer vector containing one of the source operands to be
1823/// compared.
1824/// \param B
1825/// A 128-bit integer vector containing one of the source operands to be
1826/// compared.
1827/// \param M
1828/// An 8-bit immediate operand specifying whether the characters are bytes or
1829/// words and the type of comparison to perform. \n
1830/// Bits [1:0]: Determine source data format. \n
1831/// 00: 16 unsigned bytes \n
1832/// 01: 8 unsigned words \n
1833/// 10: 16 signed bytes \n
1834/// 11: 8 signed words \n
1835/// Bits [3:2]: Determine comparison type and aggregation method. \n
1836/// 00: Subset: Each character in \a B is compared for equality with all
1837/// the characters in \a A. \n
1838/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1839/// basis is greater than or equal for even-indexed elements in \a A,
1840/// and less than or equal for odd-indexed elements in \a A. \n
1841/// 10: Match: Compare each pair of corresponding characters in \a A and
1842/// \a B for equality. \n
1843/// 11: Substring: Search \a B for substring matches of \a A. \n
1844/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1845/// mask of the comparison results. \n
1846/// 00: No effect. \n
1847/// 01: Negate the bit mask. \n
1848/// 10: No effect. \n
1849/// 11: Negate the bit mask only for bits with an index less than or equal
1850/// to the size of \a A or \a B. \n
1851/// \returns Returns 1 if the bit mask is zero and the length of the string in
1852/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
1856
1857/// Uses the immediate operand \a M to perform a comparison of string
1858/// data with implicitly defined lengths that is contained in source operands
1859/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1860/// 0.
1861///
1862/// \headerfile <x86intrin.h>
1863///
1864/// \code
1865/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1866/// \endcode
1867///
1868/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1869/// instruction.
1870///
1871/// \param A
1872/// A 128-bit integer vector containing one of the source operands to be
1873/// compared.
1874/// \param B
1875/// A 128-bit integer vector containing one of the source operands to be
1876/// compared.
1877/// \param M
1878/// An 8-bit immediate operand specifying whether the characters are bytes or
1879/// words and the type of comparison to perform. \n
1880/// Bits [1:0]: Determine source data format. \n
1881/// 00: 16 unsigned bytes \n
1882/// 01: 8 unsigned words \n
1883/// 10: 16 signed bytes \n
1884/// 11: 8 signed words \n
1885/// Bits [3:2]: Determine comparison type and aggregation method. \n
1886/// 00: Subset: Each character in \a B is compared for equality with all
1887/// the characters in \a A. \n
1888/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1889/// basis is greater than or equal for even-indexed elements in \a A,
1890/// and less than or equal for odd-indexed elements in \a A. \n
1891/// 10: Match: Compare each pair of corresponding characters in \a A and
1892/// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1894/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1895/// mask of the comparison results. \n
1896/// 00: No effect. \n
1897/// 01: Negate the bit mask. \n
1898/// 10: No effect. \n
1899/// 11: Negate the bit mask only for bits with an index less than or equal
1900/// to the size of \a A or \a B.
1901/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
1905
1906/// Uses the immediate operand \a M to perform a comparison of string
1907/// data with implicitly defined lengths that is contained in source operands
1908/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1909///
1910/// \headerfile <x86intrin.h>
1911///
1912/// \code
1913/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1914/// \endcode
1915///
1916/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1917/// instruction.
1918///
1919/// \param A
1920/// A 128-bit integer vector containing one of the source operands to be
1921/// compared.
1922/// \param B
1923/// A 128-bit integer vector containing one of the source operands to be
1924/// compared.
1925/// \param M
1926/// An 8-bit immediate operand specifying whether the characters are bytes or
1927/// words and the type of comparison to perform. \n
1928/// Bits [1:0]: Determine source data format. \n
1929/// 00: 16 unsigned bytes \n
1930/// 01: 8 unsigned words \n
1931/// 10: 16 signed bytes \n
1932/// 11: 8 signed words \n
1933/// Bits [3:2]: Determine comparison type and aggregation method. \n
1934/// 00: Subset: Each character in \a B is compared for equality with all
1935/// the characters in \a A. \n
1936/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1937/// basis is greater than or equal for even-indexed elements in \a A,
1938/// and less than or equal for odd-indexed elements in \a A. \n
1939/// 10: Match: Compare each pair of corresponding characters in \a A and
1940/// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1942/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1943/// mask of the comparison results. \n
1944/// 00: No effect. \n
1945/// 01: Negate the bit mask. \n
1946/// 10: No effect. \n
1947/// 11: Negate the bit mask only for bits with an index less than or equal
1948/// to the size of \a A or \a B. \n
1949/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
1953
1954/// Uses the immediate operand \a M to perform a comparison of string
1955/// data with implicitly defined lengths that is contained in source operands
1956/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1957/// the maximum, otherwise, returns 0.
1958///
1959/// \headerfile <x86intrin.h>
1960///
1961/// \code
1962/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1963/// \endcode
1964///
1965/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1966/// instruction.
1967///
1968/// \param A
1969/// A 128-bit integer vector containing one of the source operands to be
1970/// compared.
1971/// \param B
1972/// A 128-bit integer vector containing one of the source operands to be
1973/// compared.
1974/// \param M
1975/// An 8-bit immediate operand specifying whether the characters are bytes or
1976/// words and the type of comparison to perform. \n
1977/// Bits [1:0]: Determine source data format. \n
1978/// 00: 16 unsigned bytes \n
1979/// 01: 8 unsigned words \n
1980/// 10: 16 signed bytes \n
1981/// 11: 8 signed words \n
1982/// Bits [3:2]: Determine comparison type and aggregation method. \n
1983/// 00: Subset: Each character in \a B is compared for equality with all
1984/// the characters in \a A. \n
1985/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1986/// basis is greater than or equal for even-indexed elements in \a A,
1987/// and less than or equal for odd-indexed elements in \a A. \n
1988/// 10: Match: Compare each pair of corresponding characters in \a A and
1989/// \a B for equality. \n
1990/// 11: Substring: Search \a B for substring matches of \a A. \n
1991/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1992/// mask of the comparison results. \n
1993/// 00: No effect. \n
1994/// 01: Negate the bit mask. \n
1995/// 10: No effect. \n
1996/// 11: Negate the bit mask only for bits with an index less than or equal
1997/// to the size of \a A or \a B. \n
1998/// \returns Returns 1 if the length of the string in \a A is less than the
1999/// maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
2003
2004/// Uses the immediate operand \a M to perform a comparison of string
2005/// data with implicitly defined lengths that is contained in source operands
2006/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2007/// the maximum, otherwise, returns 0.
2008///
2009/// \headerfile <x86intrin.h>
2010///
2011/// \code
2012/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
2013/// \endcode
2014///
2015/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
2016/// instruction.
2017///
2018/// \param A
2019/// A 128-bit integer vector containing one of the source operands to be
2020/// compared.
2021/// \param B
2022/// A 128-bit integer vector containing one of the source operands to be
2023/// compared.
2024/// \param M
2025/// An 8-bit immediate operand specifying whether the characters are bytes or
2026/// words and the type of comparison to perform. \n
2027/// Bits [1:0]: Determine source data format. \n
2028/// 00: 16 unsigned bytes \n
2029/// 01: 8 unsigned words \n
2030/// 10: 16 signed bytes \n
2031/// 11: 8 signed words \n
2032/// Bits [3:2]: Determine comparison type and aggregation method. \n
2033/// 00: Subset: Each character in \a B is compared for equality with all
2034/// the characters in \a A. \n
2035/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2036/// basis is greater than or equal for even-indexed elements in \a A,
2037/// and less than or equal for odd-indexed elements in \a A. \n
2038/// 10: Match: Compare each pair of corresponding characters in \a A and
2039/// \a B for equality. \n
2040/// 11: Substring: Search \a B for substring matches of \a A. \n
2041/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2042/// mask of the comparison results. \n
2043/// 00: No effect. \n
2044/// 01: Negate the bit mask. \n
2045/// 10: No effect. \n
2046/// 11: Negate the bit mask only for bits with an index less than or equal
2047/// to the size of \a A or \a B.
2048/// \returns Returns 1 if the length of the string in \a B is less than the
2049/// maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128( \
      (__v16qi)(__m128i)(A), /* first string operand */ \
      (__v16qi)(__m128i)(B), /* second string operand */ \
      (int)(M)))             /* immediate control operand */
2053
2054/// Uses the immediate operand \a M to perform a comparison of string
2055/// data with explicitly defined lengths that is contained in source operands
2056/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2057/// string in \a B is the maximum, otherwise, returns 0.
2058///
2059/// \headerfile <x86intrin.h>
2060///
2061/// \code
2062/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2063/// \endcode
2064///
2065/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2066/// instruction.
2067///
2068/// \param A
2069/// A 128-bit integer vector containing one of the source operands to be
2070/// compared.
2071/// \param LA
2072/// An integer that specifies the length of the string in \a A.
2073/// \param B
2074/// A 128-bit integer vector containing one of the source operands to be
2075/// compared.
2076/// \param LB
2077/// An integer that specifies the length of the string in \a B.
2078/// \param M
2079/// An 8-bit immediate operand specifying whether the characters are bytes or
2080/// words and the type of comparison to perform. \n
2081/// Bits [1:0]: Determine source data format. \n
2082/// 00: 16 unsigned bytes \n
2083/// 01: 8 unsigned words \n
2084/// 10: 16 signed bytes \n
2085/// 11: 8 signed words \n
2086/// Bits [3:2]: Determine comparison type and aggregation method. \n
2087/// 00: Subset: Each character in \a B is compared for equality with all
2088/// the characters in \a A. \n
2089/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2090/// basis is greater than or equal for even-indexed elements in \a A,
2091/// and less than or equal for odd-indexed elements in \a A. \n
2092/// 10: Match: Compare each pair of corresponding characters in \a A and
2093/// \a B for equality. \n
2094/// 11: Substring: Search \a B for substring matches of \a A. \n
2095/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2096/// mask of the comparison results. \n
2097/// 00: No effect. \n
2098/// 01: Negate the bit mask. \n
2099/// 10: No effect. \n
2100/// 11: Negate the bit mask only for bits with an index less than or equal
2101/// to the size of \a A or \a B.
2102/// \returns Returns 1 if the bit mask is zero and the length of the string in
2103/// \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128( \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */ \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */ \
      (int)(M)))                        /* immediate control operand */
2108
2109/// Uses the immediate operand \a M to perform a comparison of string
2110/// data with explicitly defined lengths that is contained in source operands
2111/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2112/// returns 0.
2113///
2114/// \headerfile <x86intrin.h>
2115///
2116/// \code
2117/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2118/// \endcode
2119///
2120/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2121/// instruction.
2122///
2123/// \param A
2124/// A 128-bit integer vector containing one of the source operands to be
2125/// compared.
2126/// \param LA
2127/// An integer that specifies the length of the string in \a A.
2128/// \param B
2129/// A 128-bit integer vector containing one of the source operands to be
2130/// compared.
2131/// \param LB
2132/// An integer that specifies the length of the string in \a B.
2133/// \param M
2134/// An 8-bit immediate operand specifying whether the characters are bytes or
2135/// words and the type of comparison to perform. \n
2136/// Bits [1:0]: Determine source data format. \n
2137/// 00: 16 unsigned bytes \n
2138/// 01: 8 unsigned words \n
2139/// 10: 16 signed bytes \n
2140/// 11: 8 signed words \n
2141/// Bits [3:2]: Determine comparison type and aggregation method. \n
2142/// 00: Subset: Each character in \a B is compared for equality with all
2143/// the characters in \a A. \n
2144/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2145/// basis is greater than or equal for even-indexed elements in \a A,
2146/// and less than or equal for odd-indexed elements in \a A. \n
2147/// 10: Match: Compare each pair of corresponding characters in \a A and
2148/// \a B for equality. \n
2149/// 11: Substring: Search \a B for substring matches of \a A. \n
2150/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2151/// mask of the comparison results. \n
2152/// 00: No effect. \n
2153/// 01: Negate the bit mask. \n
2154/// 10: No effect. \n
2155/// 11: Negate the bit mask only for bits with an index less than or equal
2156/// to the size of \a A or \a B. \n
2157/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128( \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */ \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */ \
      (int)(M)))                        /* immediate control operand */
2162
2163/// Uses the immediate operand \a M to perform a comparison of string
2164/// data with explicitly defined lengths that is contained in source operands
2165/// \a A and \a B. Returns bit 0 of the resulting bit mask.
2166///
2167/// \headerfile <x86intrin.h>
2168///
2169/// \code
2170/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2171/// \endcode
2172///
2173/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2174/// instruction.
2175///
2176/// \param A
2177/// A 128-bit integer vector containing one of the source operands to be
2178/// compared.
2179/// \param LA
2180/// An integer that specifies the length of the string in \a A.
2181/// \param B
2182/// A 128-bit integer vector containing one of the source operands to be
2183/// compared.
2184/// \param LB
2185/// An integer that specifies the length of the string in \a B.
2186/// \param M
2187/// An 8-bit immediate operand specifying whether the characters are bytes or
2188/// words and the type of comparison to perform. \n
2189/// Bits [1:0]: Determine source data format. \n
2190/// 00: 16 unsigned bytes \n
2191/// 01: 8 unsigned words \n
2192/// 10: 16 signed bytes \n
2193/// 11: 8 signed words \n
2194/// Bits [3:2]: Determine comparison type and aggregation method. \n
2195/// 00: Subset: Each character in \a B is compared for equality with all
2196/// the characters in \a A. \n
2197/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2198/// basis is greater than or equal for even-indexed elements in \a A,
2199/// and less than or equal for odd-indexed elements in \a A. \n
2200/// 10: Match: Compare each pair of corresponding characters in \a A and
2201/// \a B for equality. \n
2202/// 11: Substring: Search \a B for substring matches of \a A. \n
2203/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2204/// mask of the comparison results. \n
2205/// 00: No effect. \n
2206/// 01: Negate the bit mask. \n
2207/// 10: No effect. \n
2208/// 11: Negate the bit mask only for bits with an index less than or equal
2209/// to the size of \a A or \a B.
2210/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128( \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */ \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */ \
      (int)(M)))                        /* immediate control operand */
2215
2216/// Uses the immediate operand \a M to perform a comparison of string
2217/// data with explicitly defined lengths that is contained in source operands
2218/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2219/// the maximum (16 bytes or 8 words); otherwise, returns 0.
2220///
2221/// \headerfile <x86intrin.h>
2222///
2223/// \code
2224/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2225/// \endcode
2226///
2227/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2228/// instruction.
2229///
2230/// \param A
2231/// A 128-bit integer vector containing one of the source operands to be
2232/// compared.
2233/// \param LA
2234/// An integer that specifies the length of the string in \a A.
2235/// \param B
2236/// A 128-bit integer vector containing one of the source operands to be
2237/// compared.
2238/// \param LB
2239/// An integer that specifies the length of the string in \a B.
2240/// \param M
2241/// An 8-bit immediate operand specifying whether the characters are bytes or
2242/// words and the type of comparison to perform. \n
2243/// Bits [1:0]: Determine source data format. \n
2244/// 00: 16 unsigned bytes \n
2245/// 01: 8 unsigned words \n
2246/// 10: 16 signed bytes \n
2247/// 11: 8 signed words \n
2248/// Bits [3:2]: Determine comparison type and aggregation method. \n
2249/// 00: Subset: Each character in \a B is compared for equality with all
2250/// the characters in \a A. \n
2251/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2252/// basis is greater than or equal for even-indexed elements in \a A,
2253/// and less than or equal for odd-indexed elements in \a A. \n
2254/// 10: Match: Compare each pair of corresponding characters in \a A and
2255/// \a B for equality. \n
2256/// 11: Substring: Search \a B for substring matches of \a A. \n
2257/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2258/// mask of the comparison results. \n
2259/// 00: No effect. \n
2260/// 01: Negate the bit mask. \n
2261/// 10: No effect. \n
2262/// 11: Negate the bit mask only for bits with an index less than or equal
2263/// to the size of \a A or \a B.
2264/// \returns Returns 1 if the length of the string in \a A is less than the
2265/// maximum (16 bytes or 8 words); otherwise, returns 0.
2266#define _mm_cmpestrs(A, LA, B, LB, M) \
2267 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
2268 (__v16qi)(__m128i)(B), (int)(LB), \
2269 (int)(M)))
2270
2271/// Uses the immediate operand \a M to perform a comparison of string
2272/// data with explicitly defined lengths that is contained in source operands
2273/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2274/// the maximum (16 bytes or 8 words); otherwise, returns 0.
2275///
2276/// \headerfile <x86intrin.h>
2277///
2278/// \code
2279/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
2280/// \endcode
2281///
2282/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
2283///
2284/// \param A
2285/// A 128-bit integer vector containing one of the source operands to be
2286/// compared.
2287/// \param LA
2288/// An integer that specifies the length of the string in \a A.
2289/// \param B
2290/// A 128-bit integer vector containing one of the source operands to be
2291/// compared.
2292/// \param LB
2293/// An integer that specifies the length of the string in \a B.
2294/// \param M
2295/// An 8-bit immediate operand specifying whether the characters are bytes or
2296/// words and the type of comparison to perform. \n
2297/// Bits [1:0]: Determine source data format. \n
2298/// 00: 16 unsigned bytes \n
2299/// 01: 8 unsigned words \n
2300/// 10: 16 signed bytes \n
2301/// 11: 8 signed words \n
2302/// Bits [3:2]: Determine comparison type and aggregation method. \n
2303/// 00: Subset: Each character in \a B is compared for equality with all
2304/// the characters in \a A. \n
2305/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2306/// basis is greater than or equal for even-indexed elements in \a A,
2307/// and less than or equal for odd-indexed elements in \a A. \n
2308/// 10: Match: Compare each pair of corresponding characters in \a A and
2309/// \a B for equality. \n
2310/// 11: Substring: Search \a B for substring matches of \a A. \n
2311/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2312/// mask of the comparison results. \n
2313/// 00: No effect. \n
2314/// 01: Negate the bit mask. \n
2315/// 10: No effect. \n
2316/// 11: Negate the bit mask only for bits with an index less than or equal
2317/// to the size of \a A or \a B.
2318/// \returns Returns 1 if the length of the string in \a B is less than the
2319/// maximum (16 bytes or 8 words); otherwise, returns 0.
2320#define _mm_cmpestrz(A, LA, B, LB, M) \
2321 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
2322 (__v16qi)(__m128i)(B), (int)(LB), \
2323 (int)(M)))
2324
2325/* SSE4.2 Compare Packed Data -- Greater Than. */
2326/// Compares the corresponding 64-bit elements of two 128-bit integer
2327/// vectors, treated as [2 x i64], to determine whether each element of the
2328/// first operand is greater than the matching element of the second operand.
2329///
2330/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2331///
2332/// \headerfile <x86intrin.h>
2333///
2334/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2335///
2336/// \param __V1
2337/// A 128-bit integer vector; the left-hand operand of the comparison.
2338/// \param __V2
2339/// A 128-bit integer vector; the right-hand operand of the comparison.
2340/// \returns A 128-bit integer vector containing the two 64-bit comparison results.
2341static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2342_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) {
2343 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2344}
2345
2346#undef __DEFAULT_FN_ATTRS
2347#undef __DEFAULT_FN_ATTRS_CONSTEXPR
2348
2349#include <popcntintrin.h>
2350
2351#include <crc32intrin.h>
2352
2353#endif /* __SMMINTRIN_H */
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M)
Returns a 128-bit vector of [2 x double] where the values are selected from either the first or secon...
Definition: smmintrin.h:448
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding even-indexed elements of two 128-bit vectors of [4 x i32] and returns a 128-...
Definition: smmintrin.h:571
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition: smmintrin.h:714
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128(const void *__V)
Loads integer values from a 128-bit aligned memory location to a 128-bit integer vector.
Definition: smmintrin.h:660
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors to determine if the v...
Definition: smmintrin.h:2342
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition: smmintrin.h:678
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors for equality.
Definition: smmintrin.h:1215
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition: smmintrin.h:804
#define __DEFAULT_FN_ATTRS
Definition: smmintrin.h:1546
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M)
Returns a 128-bit vector of [4 x float] where the values are selected from either the first or second...
Definition: smmintrin.h:475
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi16(__m128i __V)
Zero-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition: smmintrin.h:1357
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi64(__m128i __V)
Zero-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition: smmintrin.h:1435
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: smmintrin.h:1478
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition: smmintrin.h:750
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu32_epi64(__m128i __V)
Zero-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition: smmintrin.h:1454
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi32(__m128i __V)
Sign-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition: smmintrin.h:1257
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi64(__m128i __V)
Sign-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition: smmintrin.h:1318
#define __DEFAULT_FN_ATTRS_CONSTEXPR
Definition: smmintrin.h:33
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi32(__m128i __V)
Zero-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition: smmintrin.h:1378
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi64(__m128i __V)
Sign-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition: smmintrin.h:1278
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition: smmintrin.h:696
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition: smmintrin.h:732
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are neither all zeros nor all ones.
Definition: smmintrin.h:1140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition: smmintrin.h:768
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all ones.
Definition: smmintrin.h:1122
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mullo_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding elements of two 128-bit vectors of [4 x i32] and returns the lower 32 bits of...
Definition: smmintrin.h:552
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi32(__m128i __V)
Sign-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition: smmintrin.h:1299
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_epi64(__m128i __V)
Sign-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition: smmintrin.h:1337
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi32(__m128i __V)
Zero-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition: smmintrin.h:1416
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition: smmintrin.h:786
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi16(__m128i __V)
Sign-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition: smmintrin.h:1234
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi64(__m128i __V)
Zero-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition: smmintrin.h:1397
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V)
Finds the minimum unsigned 16-bit element in the input 128-bit vector of [8 x u16] and returns it and...
Definition: smmintrin.h:1536
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all zeros.
Definition: smmintrin.h:1105
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M)
Returns a 128-bit vector of [16 x i8] where the values are selected from either of the first or secon...
Definition: smmintrin.h:502