20 #ifndef SSE_INTRINSICS_H
21 #define SSE_INTRINSICS_H
23 #include "../common/windows_fix_intrin.h"
33 #include <xmmintrin.h>
35 #include <emmintrin.h>
38 #include "../common/fix_clang_emmintrin.h"
57 #if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `psllw` instruction: every 16-bit lane of `a` is shifted left by the
// amount in the low 64 bits of `count`, with zeros shifted in.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
58 static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__(
"psllw %1,%0" :
"+x"(a) :
"x"(count));
return a; }
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `pslld` instruction: every 32-bit lane of `a` is shifted left by the
// amount in the low 64 bits of `count`, with zeros shifted in.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
59 static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__(
"pslld %1,%0" :
"+x"(a) :
"x"(count));
return a; }
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `psllq` instruction: both 64-bit lanes of `a` are shifted left by the
// amount in the low 64 bits of `count`, with zeros shifted in.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
60 static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__(
"psllq %1,%0" :
"+x"(a) :
"x"(count));
return a; }
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `psrlw` instruction: every 16-bit lane of `a` is shifted right
// logically (zeros shifted in) by the amount in the low 64 bits of `count`.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
61 static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__(
"psrlw %1,%0" :
"+x"(a) :
"x"(count));
return a; }
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `psrld` instruction: every 32-bit lane of `a` is shifted right
// logically (zeros shifted in) by the amount in the low 64 bits of `count`.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
62 static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__(
"psrld %1,%0" :
"+x"(a) :
"x"(count));
return a; }
// Stand-in for the SSE2 intrinsic of the same name. It is compiled only when
// VC_GCC < 0x40600 and VC_DONT_FIX_SSE_SHIFT is not defined (enclosing #if).
// Issues one `psrlq` instruction: both 64-bit lanes of `a` are shifted right
// logically (zeros shifted in) by the amount in the low 64 bits of `count`.
//   "+x"(a)    : `a` is held in an SSE register and is both read and overwritten
//   "x"(count) : the shift amount is passed in an SSE register
// Returns the shifted vector.
// NOTE(review): why the stock intrinsic is bypassed on older GCC is not visible in
// this excerpt - confirm against the upstream history before removing.
63 static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__(
"psrlq %1,%0" :
"+x"(a) :
"x"(count));
return a; }
69 static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) {
return static_cast<__m128d
>(
static_cast<__v2df
>(
a) * static_cast<__v2df>(b)); }
70 static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) {
return static_cast<__m128d
>(
static_cast<__v2df
>(
a) + static_cast<__v2df>(b)); }
71 static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) {
return static_cast<__m128d
>(
static_cast<__v2df
>(
a) - static_cast<__v2df>(b)); }
72 static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) {
return static_cast<__m128
>(
static_cast<__v4sf
>(
a) * static_cast<__v4sf>(b)); }
73 static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) {
return static_cast<__m128
>(
static_cast<__v4sf
>(
a) + static_cast<__v4sf>(b)); }
74 static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) {
return static_cast<__m128
>(
static_cast<__v4sf
>(
a) - static_cast<__v4sf>(b)); }
77 #if defined(VC_GNU_ASM) && !defined(NVALGRIND)
124 #include <pmmintrin.h>
130 #include <tmmintrin.h>
// Direct aliases: each mm_* wrapper name expands to the compiler intrinsic with the
// same name plus a leading underscore (the abs/alignr intrinsics that
// <tmmintrin.h>, included above, declares).
// NOTE(review): the preprocessor condition that selects these aliases is not
// visible in this excerpt.
132 #define mm_abs_epi8 _mm_abs_epi8
133 #define mm_abs_epi16 _mm_abs_epi16
134 #define mm_abs_epi32 _mm_abs_epi32
135 #define mm_alignr_epi8 _mm_alignr_epi8
144 #if defined(VC_GCC) && VC_GCC < 0x40500
145 return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
148 return _mm_set1_epi8(a);
162 __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
163 return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative,
_mm_setone_epi8()));
176 __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
177 return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
180 __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
181 return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
184 return _mm_set1_epi8(a);
189 case 1:
return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
190 case 2:
return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
191 case 3:
return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
192 case 4:
return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
193 case 5:
return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
194 case 6:
return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
195 case 7:
return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
196 case 8:
return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
197 case 9:
return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
198 case 10:
return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
199 case 11:
return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
200 case 12:
return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
201 case 13:
return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
202 case 14:
return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
203 case 15:
return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
205 case 17:
return _mm_srli_si128(a, 1);
206 case 18:
return _mm_srli_si128(a, 2);
207 case 19:
return _mm_srli_si128(a, 3);
208 case 20:
return _mm_srli_si128(a, 4);
209 case 21:
return _mm_srli_si128(a, 5);
210 case 22:
return _mm_srli_si128(a, 6);
211 case 23:
return _mm_srli_si128(a, 7);
212 case 24:
return _mm_srli_si128(a, 8);
213 case 25:
return _mm_srli_si128(a, 9);
214 case 26:
return _mm_srli_si128(a, 10);
215 case 27:
return _mm_srli_si128(a, 11);
216 case 28:
return _mm_srli_si128(a, 12);
217 case 29:
return _mm_srli_si128(a, 13);
218 case 30:
return _mm_srli_si128(a, 14);
219 case 31:
return _mm_srli_si128(a, 15);
221 return _mm_setzero_si128();
231 #ifdef VC_IMPL_SSE4_1
233 #include <smmintrin.h>
// With VC_IMPL_SSE4_1 defined, <smmintrin.h> is included and every mm_* wrapper
// name below expands directly to the compiler intrinsic with the same name plus a
// leading underscore.
// Blend aliases.
240 #define mm_blendv_pd _mm_blendv_pd
241 #define mm_blendv_ps _mm_blendv_ps
242 #define mm_blendv_epi8 _mm_blendv_epi8
243 #define mm_blend_epi16 _mm_blend_epi16
244 #define mm_blend_ps _mm_blend_ps
245 #define mm_blend_pd _mm_blend_pd
// Min/max aliases.
247 #define mm_min_epi32 _mm_min_epi32
248 #define mm_max_epi32 _mm_max_epi32
249 #define mm_min_epu32 _mm_min_epu32
250 #define mm_max_epu32 _mm_max_epu32
253 #define mm_min_epu16 _mm_min_epu16
254 #define mm_max_epu16 _mm_max_epu16
255 #define mm_min_epi8 _mm_min_epi8
256 #define mm_max_epi8 _mm_max_epi8
// Integer widening-conversion aliases.
258 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32
259 #define mm_cvtepu8_epi16 _mm_cvtepu8_epi16
260 #define mm_cvtepi8_epi16 _mm_cvtepi8_epi16
// The next line repeats the mm_cvtepu16_epi32 alias with an identical replacement
// list; an identical redefinition is permitted, so the repeat is redundant but harmless.
261 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32
262 #define mm_cvtepi16_epi32 _mm_cvtepi16_epi32
263 #define mm_cvtepu8_epi32 _mm_cvtepu8_epi32
264 #define mm_cvtepi8_epi32 _mm_cvtepi8_epi32
// Streaming-load alias.
265 #define mm_stream_load_si128 _mm_stream_load_si128
277 return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
280 return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
283 return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
293 return _mm_shuffle_pd(b, a, 2);
295 return _mm_shuffle_pd(a, b, 2);
321 c = _mm_set_epi32(0, -1, 0, -1);
333 c = _mm_set_epi32(-1, 0, 0, -1);
336 c = _mm_set_epi32(-1, 0, -1, 0);
339 c = _mm_set_epi32(-1, 0, -1, -1);
345 c = _mm_set_epi32(-1, -1, 0, -1);
354 c = _mm_setzero_si128();
357 __m128 _c = _mm_castsi128_ps(c);
358 return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
375 return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
409 return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
411 return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
413 const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
414 c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
417 return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
451 return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
454 return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
457 return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
460 return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
466 const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
467 const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
468 return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
471 return _mm_load_si128(mem);
479 #ifdef VC_IMPL_POPCNT
480 #include <popcntintrin.h>
484 #ifdef VC_IMPL_SSE4_2
486 #include <nmmintrin.h>
499 f = _mm_cvtss_f32(v);
501 #if defined VC_IMPL_SSE4_1 && !defined VC_MSVC
504 f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
507 _MM_EXTRACT_FLOAT(f, v, i);
512 f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
515 f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
518 f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
526 return _mm_cvtsd_f64(v);
528 return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v))));
532 if (__builtin_constant_p(i)) {
542 const float4 &data =
reinterpret_cast<const float4 &
>(
v);
546 union { __m128
v;
float m[4]; } u;
553 #ifdef VC_IMPL_SSE4_1
554 return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
556 return _mm_load_ps(mem);
560 #ifdef VC_IMPL_SSE4_1
561 return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
563 return _mm_load_pd(mem);
567 #ifdef VC_IMPL_SSE4_1
568 return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
570 return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
593 #if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)
595 #include <x86intrin.h>
602 #endif // SSE_INTRINSICS_H
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi16_epi32(__m128i epu16)
static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i)
Namespace for new ROOT classes and functions.
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi8(__m128i a, __m128i b)
static Vc_INTRINSIC __m128d mm_blend_pd(__m128d a, __m128d b, const int mask)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()
static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd()
static Vc_INTRINSIC __m128 mm_blend_ps(__m128 a, __m128 b, const int mask)
static Vc_INTRINSIC __m128i mm_blend_epi16(__m128i a, __m128i b, const int mask)
static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps()
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8()
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu32(__m128i a, __m128i b)
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi32(__m128i epi8)
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi32(__m128i epu8)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128()
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi8(__m128i a)
static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps()
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi32(__m128i a)
static Vc_INTRINSIC __m128i mm_blendv_epi8(__m128i a, __m128i b, __m128i c)
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd()
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu16(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi16(__m128i a)
static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps()
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi8(__m128i a, __m128i b)
static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i)
static Vc_INTRINSIC Vc_PURE __m128i mm_stream_load_si128(__m128i *mem)
static Vc_INTRINSIC __m128d mm_blendv_pd(__m128d a, __m128d b, __m128d c)
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi16(__m128i epu8)
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu16(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8()
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem)
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi32(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32()
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone()
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu32(__m128i a, __m128i b)
static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd()
static Vc_INTRINSIC __m128 mm_blendv_ps(__m128 a, __m128 b, __m128 c)
static Vc_INTRINSIC __m128i Vc_CONST mm_alignr_epi8(__m128i a, __m128i b, const int s)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()
static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i)
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi16(__m128i epi8)
static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a)
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi32(__m128i a, __m128i b)
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b)
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu16_epi32(__m128i epu16)
static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps()
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16()