intrinsics.h
/* This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef VC_AVX_INTRINSICS_H
#define VC_AVX_INTRINSICS_H

#include "../common/windows_fix_intrin.h"

#include <Vc/global.h>

// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>

#if (defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)) && !defined(VC_MSVC)
#include <x86intrin.h>
#endif
}

#include "../common/fix_clang_emmintrin.h"

#if defined(VC_CLANG) && VC_CLANG < 0x30100
// _mm_permute_ps is broken: http://llvm.org/bugs/show_bug.cgi?id=12401
#undef _mm_permute_ps
#define _mm_permute_ps(A, C) __extension__ ({ \
    __m128 __A = (A); \
    (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                    (C) & 0x3, ((C) & 0xc) >> 2, \
                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
#endif

#include "const_data.h"
#include "macros.h"
#include <cstdlib>

#if defined(VC_CLANG) || defined(VC_MSVC) || (defined(VC_GCC) && !defined(__OPTIMIZE__))
#define VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT
#endif

#if defined(VC_CLANG) && VC_CLANG <= 0x30000
// _mm_alignr_epi8 doesn't specify its return type, thus breaking overload resolution
#undef _mm_alignr_epi8
#define _mm_alignr_epi8(a, b, n) ((__m128i)__builtin_ia32_palignr128((a), (b), (n)))
#endif

namespace ROOT {
namespace Vc
{
namespace AVX
{
    /* super evil hacking around C++ features:
     * consider
     *   void fun(int);
     *   namespace X { void fun(int); }
     *   namespace X { void bar() { fun(0); } } // this will be a call to X::fun(int)
     *
     *   void fun(m256);
     *   namespace X { void fun(m256); }
     *   namespace X { void bar() { fun(0); } } // this will be ambiguous because m256 is a
     *                                          // non-fundamental type in the global namespace, thus
     *                                          // adding ::fun(m256) to the candidates
     *
     * To make my own overloads of the intrinsics distinct I have to use a type that is inside the
     * Vc::AVX namespace. To reduce porting effort and increase generality I want to use the same
     * function names as used in the global namespace. The type name may not be the same, though,
     * because identifiers starting with two underscores are reserved by the standard. Thus using
     * those would mean depending on undefined behavior.
     * Sadly a typedef is not enough.
     * Public inheritance also does not work, because at least ICC considers the __m??? types to be
     * some sort of fundamental types.
     * Thus composition is the only solution.
     */
#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
    template<typename T> struct Alias
    {
        typedef T Base;
        T _d;
        Vc_ALWAYS_INLINE operator T &() { return _d; }
        Vc_ALWAYS_INLINE operator const T &() const { return _d; }
        Vc_ALWAYS_INLINE Alias() {}
        Vc_ALWAYS_INLINE Alias(T x) : _d(x) {}
        Vc_ALWAYS_INLINE Alias(const Alias &x) : _d(x._d) {}
        Vc_ALWAYS_INLINE Alias &operator=(T x) { _d = x; return *this; }
        Vc_ALWAYS_INLINE Alias &operator=(const Alias &x) { _d = x._d; return *this; }
    };
    typedef Alias<__m128 > m128 ;
    typedef Alias<__m128d> m128d;
    typedef Alias<__m128i> m128i;
    typedef Alias<__m256 > m256 ;
    typedef Alias<__m256d> m256d;
    typedef Alias<__m256i> m256i;
#else
    typedef __m128  m128 ;
    typedef __m128d m128d;
    typedef __m128i m128i;
    typedef __m256  m256 ;
    typedef __m256d m256d;
    typedef __m256i m256i;
#endif
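    // With the Alias wrapper the ambiguity from the comment above disappears: for a
    // Vc::AVX::m256 argument, argument-dependent lookup now associates Vc::AVX instead of
    // the global namespace, so a global ::fun(__m256) is no longer a candidate and a call
    // like
    //   m256 x; fun(x);   // inside Vc::AVX: resolves to Vc::AVX::fun(m256) only
    // is unambiguous (illustrative sketch, not part of the original source).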
#if defined(VC_UNCONDITIONAL_AVX2_INTRINSICS) && defined(VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN)
    typedef const m128  & param128 ;
    typedef const m128d & param128d;
    typedef const m128i & param128i;
    typedef const m256  & param256 ;
    typedef const m256d & param256d;
    typedef const m256i & param256i;
#else
    typedef const m128  param128 ;
    typedef const m128d param128d;
    typedef const m128i param128i;
    typedef const m256  param256 ;
    typedef const m256d param256d;
    typedef const m256i param256i;
#endif

#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
    // Make the use of the cast intrinsics easier. But if param256 == const __m256, these
    // would lead to ambiguities.
    static Vc_INTRINSIC m256i Vc_CONST _mm256_castps_si256(param256  a) { return ::_mm256_castps_si256(a); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_castps_pd   (param256  a) { return ::_mm256_castps_pd   (a); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_castpd_si256(param256d a) { return ::_mm256_castpd_si256(a); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_castpd_ps   (param256d a) { return ::_mm256_castpd_ps   (a); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_castsi256_ps(param256i a) { return ::_mm256_castsi256_ps(a); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_castsi256_pd(param256i a) { return ::_mm256_castsi256_pd(a); }
#endif

#ifdef VC_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256  _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
    static Vc_INTRINSIC Vc_CONST m256  _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
    static Vc_INTRINSIC Vc_CONST m256  _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
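    // Consequently an expression such as
    //   m256d d = _mm256_add_pd(_mm256_mul_pd(a, b), c);
    // compiles to plain vector * and + on GCC, which the fp-contraction pass may then fuse
    // into a single vfmadd instruction (illustrative note; a, b, c are hypothetical values,
    // and whether fusion happens depends on compiler flags and target).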

    static Vc_INTRINSIC m256  Vc_CONST _mm256_set1_ps   (float  a) { return ::_mm256_set1_ps   (a); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_set1_pd   (double a) { return ::_mm256_set1_pd   (a); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epi32(int    a) { return ::_mm256_set1_epi32(a); }
    //static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); }

#if defined(VC_GNU_ASM) && !defined(NVALGRIND)
    static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; }
#else
    static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); }
#endif
    static Vc_INTRINSIC m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); }
    static Vc_INTRINSIC m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); }
    static Vc_INTRINSIC m128  Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); }

    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }

#if defined(VC_GNU_ASM) && !defined(NVALGRIND)
    static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { __m256 r; __asm__("vcmpps $8,%0,%0,%0":"=x"(r)); return r; }
#elif defined(VC_MSVC)
    // MSVC puts temporaries of this value on the stack, but sometimes at misaligned addresses; try
    // some other generator instead...
    static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { return _mm256_castsi256_ps(_mm256_set1_epi32(-1)); }
#else
    static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { m256 r = _mm256_setzero_ps(); return _mm256_cmp_ps(r, r, _CMP_EQ_UQ); }
#endif
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setallone_si256() { return _mm256_castps_si256(_mm256_setallone()); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_setallone_pd() { return _mm256_castps_pd(_mm256_setallone()); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_setallone_ps() { return _mm256_setallone(); }

    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi8 () { return _mm256_set1_epi8(1); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu8 () { return _mm256_setone_epi8(); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu16() { return _mm256_setone_epi16(); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu32() { return _mm256_setone_epi32(); }

    static Vc_INTRINSIC m256  Vc_CONST _mm256_setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }

    static Vc_INTRINSIC m256d Vc_CONST _mm256_setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }

    static Vc_INTRINSIC m256  Vc_CONST _mm256_set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

    //X static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi8 () { return _mm256_slli_epi8 (_mm256_setallone_si256(), 7); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

#ifdef VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT
#define _mm_extract_epu8(x, i) (static_cast<unsigned char> (_mm_extract_epi8 ((x), (i))))
#define _mm_extract_epu16(x, i) (static_cast<unsigned short>(_mm_extract_epi16((x), (i))))
#define _mm_extract_epu32(x, i) (static_cast<unsigned int>  (_mm_extract_epi32((x), (i))))
#else
    static Vc_INTRINSIC unsigned char  Vc_CONST _mm_extract_epu8(param128i x, const int i) { return _mm_extract_epi8(x, i); }
    static Vc_INTRINSIC unsigned short Vc_CONST _mm_extract_epu16(param128i x, const int i) { return _mm_extract_epi16(x, i); }
    static Vc_INTRINSIC unsigned int   Vc_CONST _mm_extract_epu32(param128i x, const int i) { return _mm_extract_epi32(x, i); }
#endif

    /////////////////////// COMPARE OPS ///////////////////////
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpeq_pd   (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpneq_pd  (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmplt_pd   (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnlt_pd  (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmple_pd   (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnle_pd  (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpord_pd  (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
    static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpunord_pd(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }

    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpeq_ps   (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpneq_ps  (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmplt_ps   (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpnlt_ps  (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpge_ps   (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmple_ps   (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpnle_ps  (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpgt_ps   (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpord_ps  (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
    static Vc_INTRINSIC m256  Vc_CONST _mm256_cmpunord_ps(param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }

    static Vc_INTRINSIC m128i _mm_cmplt_epu16(param128i a, param128i b) {
        return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
    }
    static Vc_INTRINSIC m128i _mm_cmpgt_epu16(param128i a, param128i b) {
        return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
    }
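    // How these work: SSE has no unsigned 16-bit compares, but XOR-ing both operands with
    // 0x8000 (_mm_setmin_epi16) maps the unsigned range [0, 0xffff] order-preservingly onto
    // the signed range [-0x8000, 0x7fff], so a signed compare of the biased values yields
    // the unsigned result. Worked example: a = 0x8000, b = 1 gives biased values 0 and
    // -32767, and 0 > -32767 agrees with 32768 > 1 unsigned.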

    /////////////////////// INTEGER OPS ///////////////////////
#define AVX_TO_SSE_2(name) \
    static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, param256i b0) { \
        m128i a1 = _mm256_extractf128_si256(a0, 1); \
        m128i b1 = _mm256_extractf128_si256(b0, 1); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
        m128i r1 = _mm_##name(a1, b1); \
        return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \
    }
#define AVX_TO_SSE_2_si128_si256(name) \
    static Vc_INTRINSIC m256i Vc_CONST _mm256_##name##_si256(param256i a0, param256i b0) { \
        m128i a1 = _mm256_extractf128_si256(a0, 1); \
        m128i b1 = _mm256_extractf128_si256(b0, 1); \
        m128i r0 = _mm_##name##_si128(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
        m128i r1 = _mm_##name##_si128(a1, b1); \
        return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \
    }
#define AVX_TO_SSE_1(name) \
    static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0) { \
        m128i a1 = _mm256_extractf128_si256(a0, 1); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
        m128i r1 = _mm_##name(a1); \
        return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \
    }
#define AVX_TO_SSE_1i(name) \
    static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, const int i) { \
        m128i a1 = _mm256_extractf128_si256(a0, 1); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
        m128i r1 = _mm_##name(a1, i); \
        return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \
    }
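    // Expansion sketch: AVX_TO_SSE_2(add_epi16), for example, defines
    //   m256i _mm256_add_epi16(param256i a0, param256i b0)
    // which splits each 256-bit argument into its two 128-bit lanes, applies the SSE
    // intrinsic _mm_add_epi16 to the low and high lane pairs separately, and reassembles
    // the result with _mm256_insertf128_si256. All the integer "AVX2" operations below are
    // emulated on AVX-only hardware this way.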

    AVX_TO_SSE_2(cmplt_epi8)
    AVX_TO_SSE_2(cmplt_epi16)
    AVX_TO_SSE_2(cmplt_epi32)
    AVX_TO_SSE_2(cmpeq_epi8)
    AVX_TO_SSE_2(cmpeq_epi16)
    AVX_TO_SSE_2(cmpeq_epi32)
    AVX_TO_SSE_2(cmpgt_epi8)
    AVX_TO_SSE_2(cmpgt_epi16)
    AVX_TO_SSE_2(cmpgt_epi32)

    // This code is AVX only (without AVX2). We never asked for AVX2 intrinsics. So go away... :)
#if defined _mm256_srli_si256
#undef _mm256_srli_si256
#endif
#if defined _mm256_slli_si256
#undef _mm256_slli_si256
#endif
#if defined _mm256_blend_epi16
#undef _mm256_blend_epi16
#endif
    static Vc_INTRINSIC m256i Vc_CONST _mm256_srli_si256(param256i a0, const int i) {
        const m128i vLo = _mm256_castsi256_si128(a0);
        const m128i vHi = _mm256_extractf128_si256(a0, 1);
        switch (i) {
        case  0: return a0;
        case  1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  1)), _mm_srli_si128(vHi,  1), 1);
        case  2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  2)), _mm_srli_si128(vHi,  2), 1);
        case  3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  3)), _mm_srli_si128(vHi,  3), 1);
        case  4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  4)), _mm_srli_si128(vHi,  4), 1);
        case  5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  5)), _mm_srli_si128(vHi,  5), 1);
        case  6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  6)), _mm_srli_si128(vHi,  6), 1);
        case  7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  7)), _mm_srli_si128(vHi,  7), 1);
        case  8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  8)), _mm_srli_si128(vHi,  8), 1);
        case  9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo,  9)), _mm_srli_si128(vHi,  9), 1);
        case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 10)), _mm_srli_si128(vHi, 10), 1);
        case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 11)), _mm_srli_si128(vHi, 11), 1);
        case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 12)), _mm_srli_si128(vHi, 12), 1);
        case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 13)), _mm_srli_si128(vHi, 13), 1);
        case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 14)), _mm_srli_si128(vHi, 14), 1);
        case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 15)), _mm_srli_si128(vHi, 15), 1);
        case 16: return _mm256_permute2f128_si256(a0, a0, 0x81);
        case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  1)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  1)), 0x80);
        case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  2)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  2)), 0x80);
        case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  3)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  3)), 0x80);
        case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  4)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  4)), 0x80);
        case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  5)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  5)), 0x80);
        case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  6)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  6)), 0x80);
        case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  7)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  7)), 0x80);
        case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  8)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  8)), 0x80);
        case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi,  9)), _mm256_castsi128_si256(_mm_srli_si128(vHi,  9)), 0x80);
        case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), 0x80);
        case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), 0x80);
        case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), 0x80);
        case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), 0x80);
        case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), 0x80);
        case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), 0x80);
        }
        return _mm256_setzero_si256();
    }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_slli_si256(param256i a0, const int i) {
        const m128i vLo = _mm256_castsi256_si128(a0);
        const m128i vHi = _mm256_extractf128_si256(a0, 1);
        switch (i) {
        case  0: return a0;
        case  1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  1)), _mm_alignr_epi8(vHi, vLo, 15), 1);
        case  2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  2)), _mm_alignr_epi8(vHi, vLo, 14), 1);
        case  3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  3)), _mm_alignr_epi8(vHi, vLo, 13), 1);
        case  4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  4)), _mm_alignr_epi8(vHi, vLo, 12), 1);
        case  5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  5)), _mm_alignr_epi8(vHi, vLo, 11), 1);
        case  6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  6)), _mm_alignr_epi8(vHi, vLo, 10), 1);
        case  7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  7)), _mm_alignr_epi8(vHi, vLo,  9), 1);
        case  8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  8)), _mm_alignr_epi8(vHi, vLo,  8), 1);
        case  9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  9)), _mm_alignr_epi8(vHi, vLo,  7), 1);
        case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm_alignr_epi8(vHi, vLo,  6), 1);
        case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm_alignr_epi8(vHi, vLo,  5), 1);
        case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm_alignr_epi8(vHi, vLo,  4), 1);
        case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm_alignr_epi8(vHi, vLo,  3), 1);
        case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm_alignr_epi8(vHi, vLo,  2), 1);
        case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm_alignr_epi8(vHi, vLo,  1), 1);
        case 16: return _mm256_permute2f128_si256(a0, a0, 0x8);
        case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  1)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  1)), 0x8);
        case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  2)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  2)), 0x8);
        case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  3)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  3)), 0x8);
        case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  4)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  4)), 0x8);
        case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  5)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  5)), 0x8);
        case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  6)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  6)), 0x8);
        case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  7)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  7)), 0x8);
        case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  8)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  8)), 0x8);
        case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo,  9)), _mm256_castsi128_si256(_mm_slli_si128(vLo,  9)), 0x8);
        case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), 0x8);
        case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), 0x8);
        case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), 0x8);
        case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), 0x8);
        case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), 0x8);
        case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), 0x8);
        }
        return _mm256_setzero_si256();
    }
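    // Note on the switch-based implementations above: _mm_slli_si128, _mm_srli_si128 and
    // _mm_alignr_epi8 require compile-time immediate shift counts, so a run-time byte count
    // has to be dispatched over all 32 possible values. Shifts of 16 bytes or more move data
    // across the 128-bit lane boundary, which _mm256_permute2f128_si256 handles because its
    // control byte can also zero a lane.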

    static Vc_INTRINSIC m256i Vc_CONST _mm256_and_si256(param256i x, param256i y) {
        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_andnot_si256(param256i x, param256i y) {
        return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_or_si256(param256i x, param256i y) {
        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_xor_si256(param256i x, param256i y) {
        return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
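    // AVX (without AVX2) provides no 256-bit integer and/andnot/or/xor, so the four
    // emulations above bounce the operands through the float domain; the casts are free
    // bit-pattern reinterpretations, only the execution domain of the operation changes.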

    AVX_TO_SSE_2(packs_epi16)
    AVX_TO_SSE_2(packs_epi32)
    AVX_TO_SSE_2(packus_epi16)
    AVX_TO_SSE_2(unpackhi_epi8)
    AVX_TO_SSE_2(unpackhi_epi16)
    AVX_TO_SSE_2(unpackhi_epi32)
    AVX_TO_SSE_2(unpackhi_epi64)
    AVX_TO_SSE_2(unpacklo_epi8)
    AVX_TO_SSE_2(unpacklo_epi16)
    AVX_TO_SSE_2(unpacklo_epi32)
    AVX_TO_SSE_2(unpacklo_epi64)
    AVX_TO_SSE_2(add_epi8)
    AVX_TO_SSE_2(add_epi16)
    AVX_TO_SSE_2(add_epi32)
    AVX_TO_SSE_2(add_epi64)
    AVX_TO_SSE_2(adds_epi8)
    AVX_TO_SSE_2(adds_epi16)
    AVX_TO_SSE_2(adds_epu8)
    AVX_TO_SSE_2(adds_epu16)
    AVX_TO_SSE_2(sub_epi8)
    AVX_TO_SSE_2(sub_epi16)
    AVX_TO_SSE_2(sub_epi32)
    AVX_TO_SSE_2(sub_epi64)
    AVX_TO_SSE_2(subs_epi8)
    AVX_TO_SSE_2(subs_epi16)
    AVX_TO_SSE_2(subs_epu8)
    AVX_TO_SSE_2(subs_epu16)
    AVX_TO_SSE_2(madd_epi16)
    AVX_TO_SSE_2(mulhi_epi16)
    AVX_TO_SSE_2(mullo_epi16)
    AVX_TO_SSE_2(mul_epu32)
    AVX_TO_SSE_1i(slli_epi16)
    AVX_TO_SSE_1i(slli_epi32)
    AVX_TO_SSE_1i(slli_epi64)
    AVX_TO_SSE_1i(srai_epi16)
    AVX_TO_SSE_1i(srai_epi32)
    AVX_TO_SSE_1i(srli_epi16)
    AVX_TO_SSE_1i(srli_epi32)
    AVX_TO_SSE_1i(srli_epi64)
    AVX_TO_SSE_2(sll_epi16)
    AVX_TO_SSE_2(sll_epi32)
    AVX_TO_SSE_2(sll_epi64)
    AVX_TO_SSE_2(sra_epi16)
    AVX_TO_SSE_2(sra_epi32)
    AVX_TO_SSE_2(srl_epi16)
    AVX_TO_SSE_2(srl_epi32)
    AVX_TO_SSE_2(srl_epi64)
    AVX_TO_SSE_2(max_epi16)
    AVX_TO_SSE_2(max_epu8)
    AVX_TO_SSE_2(min_epi16)
    AVX_TO_SSE_2(min_epu8)
    Vc_INTRINSIC int Vc_CONST _mm256_movemask_epi8(param256i a0)
    {
        m128i a1 = _mm256_extractf128_si256(a0, 1);
        return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
    }
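    // _mm_movemask_epi8 extracts one bit per byte from a 128-bit lane (16 bits); shifting
    // the high lane's mask left by 16 and OR-ing in the low lane's mask reconstructs the
    // full 32-bit result that the AVX2 instruction would produce.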
    AVX_TO_SSE_2(mulhi_epu16)
    // shufflehi_epi16
    // shufflelo_epi16 (param128i __A, const int __mask)
    // shuffle_epi32 (param128i __A, const int __mask)
    // maskmoveu_si128 (param128i __A, param128i __B, char *__C)
    AVX_TO_SSE_2(avg_epu8)
    AVX_TO_SSE_2(avg_epu16)
    AVX_TO_SSE_2(sad_epu8)
    // stream_si32 (int *__A, int __B)
    // stream_si128 (param128i *__A, param128i __B)
    // cvtsi32_si128 (int __A)
    // cvtsi64_si128 (long long __A)
    // cvtsi64x_si128 (long long __A)
    AVX_TO_SSE_2(hadd_epi16)
    AVX_TO_SSE_2(hadd_epi32)
    AVX_TO_SSE_2(hadds_epi16)
    AVX_TO_SSE_2(hsub_epi16)
    AVX_TO_SSE_2(hsub_epi32)
    AVX_TO_SSE_2(hsubs_epi16)
    AVX_TO_SSE_2(maddubs_epi16)
    AVX_TO_SSE_2(mulhrs_epi16)
    AVX_TO_SSE_2(shuffle_epi8)
    AVX_TO_SSE_2(sign_epi8)
    AVX_TO_SSE_2(sign_epi16)
    AVX_TO_SSE_2(sign_epi32)
    // alignr_epi8(param128i __X, param128i __Y, const int __N)
    AVX_TO_SSE_1(abs_epi8)
    AVX_TO_SSE_1(abs_epi16)
    AVX_TO_SSE_1(abs_epi32)
480  m256i Vc_INTRINSIC Vc_CONST _mm256_blend_epi16(param256i a0, param256i b0, const int m) {
481  m128i a1 = _mm256_extractf128_si256(a0, 1);
482  m128i b1 = _mm256_extractf128_si256(b0, 1);
483  m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
484  m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
485  return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1);
486  }
487 #else
488 # define _mm256_blend_epi16(a0, b0, m) \
489  _mm256_insertf128_si256( \
490  _mm256_castsi128_si256( \
491  _mm_blend_epi16( \
492  _mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff)), \
493  _mm_blend_epi16(_mm256_extractf128_si256(a0, 1), _mm256_extractf128_si256(b0, 1), m >> 8);, 1)
494 #endif
    Vc_INTRINSIC m256i Vc_CONST _mm256_blendv_epi8(param256i a0, param256i b0, param256i m0) {
        m128i a1 = _mm256_extractf128_si256(a0, 1);
        m128i b1 = _mm256_extractf128_si256(b0, 1);
        m128i m1 = _mm256_extractf128_si256(m0, 1);
        m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
        m128i r1 = _mm_blendv_epi8(a1, b1, m1);
        return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1);
    }
    AVX_TO_SSE_2(cmpeq_epi64)
    AVX_TO_SSE_2(min_epi8)
    AVX_TO_SSE_2(max_epi8)
    AVX_TO_SSE_2(min_epu16)
    AVX_TO_SSE_2(max_epu16)
    AVX_TO_SSE_2(min_epi32)
    AVX_TO_SSE_2(max_epi32)
    AVX_TO_SSE_2(min_epu32)
    AVX_TO_SSE_2(max_epu32)
    AVX_TO_SSE_2(mullo_epi32)
    AVX_TO_SSE_2(mul_epi32)
#if !defined(VC_CLANG) || VC_CLANG > 0x30100
    // clang is missing _mm_minpos_epu16 from smmintrin.h
    // http://llvm.org/bugs/show_bug.cgi?id=12399
    AVX_TO_SSE_1(minpos_epu16)
#endif
    AVX_TO_SSE_1(cvtepi8_epi32)
    AVX_TO_SSE_1(cvtepi16_epi32)
    AVX_TO_SSE_1(cvtepi8_epi64)
    AVX_TO_SSE_1(cvtepi32_epi64)
    AVX_TO_SSE_1(cvtepi16_epi64)
    AVX_TO_SSE_1(cvtepi8_epi16)
    AVX_TO_SSE_1(cvtepu8_epi32)
    AVX_TO_SSE_1(cvtepu16_epi32)
    AVX_TO_SSE_1(cvtepu8_epi64)
    AVX_TO_SSE_1(cvtepu32_epi64)
    AVX_TO_SSE_1(cvtepu16_epi64)
    AVX_TO_SSE_1(cvtepu8_epi16)
    AVX_TO_SSE_2(packus_epi32)
    // mpsadbw_epu8 (param128i __X, param128i __Y, const int __M)
    // stream_load_si128 (param128i *__X)
    AVX_TO_SSE_2(cmpgt_epi64)

//X     static Vc_INTRINSIC m256i _mm256_cmplt_epu8 (param256i a, param256i b) { return _mm256_cmplt_epi8 (
//X             _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); }
//X     static Vc_INTRINSIC m256i _mm256_cmpgt_epu8 (param256i a, param256i b) { return _mm256_cmpgt_epi8 (
//X             _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_cmplt_epu32(param256i _a, param256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32())));
        return _mm256_insertf128_si256(_mm256_castsi128_si256(
                    _mm_cmplt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))),
                _mm_cmplt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1);
    }
    static Vc_INTRINSIC m256i Vc_CONST _mm256_cmpgt_epu32(param256i _a, param256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32())));
        return _mm256_insertf128_si256(_mm256_castsi128_si256(
                    _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))),
                _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1);
    }
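    // These use the same sign-bit bias trick as _mm_cmplt_epu16 above, per 32-bit element;
    // the XOR goes through _mm256_xor_ps because AVX (without AVX2) offers no 256-bit
    // integer XOR.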

    static Vc_INTRINSIC void _mm256_maskstore(float *mem, const param256 mask, const param256 v) {
#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE
        _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
#else
        _mm256_maskstore_ps(mem, mask, v);
#endif
    }
    static Vc_INTRINSIC void _mm256_maskstore(double *mem, const param256d mask, const param256d v) {
#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE
        _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
#else
        _mm256_maskstore_pd(mem, mask, v);
#endif
    }
    static Vc_INTRINSIC void _mm256_maskstore(int *mem, const param256i mask, const param256i v) {
#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE
        _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#else
        _mm256_maskstore_ps(reinterpret_cast<float *>(mem), _mm256_castsi256_ps(mask), _mm256_castsi256_ps(v));
#endif
    }
    static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const param256i mask, const param256i v) {
        _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
    }
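    // Usage sketch (hypothetical values): these overloads paper over compilers that declare
    // the mask parameter of _mm256_maskstore_ps/_pd with the wrong vector type (selected via
    // VC_MM256_MASKSTORE_WRONG_MASK_TYPE), so callers can uniformly write e.g.
    //   float mem[8]; _mm256_maskstore(mem, mask, v);  // stores only the mask-selected lanes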

#if defined(VC_IMPL_FMA4) && defined(VC_CLANG) && VC_CLANG < 0x30300
    // clang miscompiles _mm256_macc_ps: http://llvm.org/bugs/show_bug.cgi?id=15040
    static Vc_INTRINSIC __m256 my256_macc_ps(__m256 a, __m256 b, __m256 c) {
        __m256 r;
        // avoid loading c from memory as that would trigger the bug
        asm("vfmaddps %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c));
        return r;
    }
#ifdef _mm256_macc_ps
#undef _mm256_macc_ps
#endif
#define _mm256_macc_ps(a, b, c) Vc::AVX::my256_macc_ps(a, b, c)

    static Vc_INTRINSIC __m256d my256_macc_pd(__m256d a, __m256d b, __m256d c) {
        __m256d r;
        // avoid loading c from memory as that would trigger the bug
        asm("vfmaddpd %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c));
        return r;
    }
#ifdef _mm256_macc_pd
#undef _mm256_macc_pd
#endif
#define _mm256_macc_pd(a, b, c) Vc::AVX::my256_macc_pd(a, b, c)
#endif
} // namespace AVX
} // namespace Vc
} // namespace ROOT
#include "undomacros.h"

#include "shuffle.h"

#endif // VC_AVX_INTRINSICS_H