ROOT  6.06/09
Reference Guide
intrinsics.h
Go to the documentation of this file.
1 /* This file is part of the Vc library.
2 
3  Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
4 
5  Vc is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as
7  published by the Free Software Foundation, either version 3 of
8  the License, or (at your option) any later version.
9 
10  Vc is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17 
18 */
19 
20 #ifndef SSE_INTRINSICS_H
21 #define SSE_INTRINSICS_H
22 
23 #include "../common/windows_fix_intrin.h"
24 
25 // The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This is not
26 // really a problem, unless there is another place where the exact same functions are declared.
27 // Then the linkage must be the same, otherwise it won't compile. Such a case occurs on Windows,
28 // where the intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again.
29 extern "C" {
30 // MMX
31 #include <mmintrin.h>
32 // SSE
33 #include <xmmintrin.h>
34 // SSE2
35 #include <emmintrin.h>
36 }
37 
38 #include "../common/fix_clang_emmintrin.h"
39 
40 #include "const_data.h"
41 #include <cstdlib>
42 #include "macros.h"
43 
44 #ifdef __3dNOW__
45 extern "C" {
46 #include <mm3dnow.h>
47 }
48 #endif
49 
50 namespace ROOT {
51 namespace Vc
52 {
53 namespace SSE
54 {
56 
57 #if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
58  static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
59  static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
60  static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
61  static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
62  static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
63  static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
64 #endif
65 
66 #ifdef VC_GCC
67  // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
68  // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
69  static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
70  static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
71  static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
72  static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
73  static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
74  static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
75 #endif
76 
77 #if defined(VC_GNU_ASM) && !defined(NVALGRIND)
78  static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; }
79 #else
80  static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); }
81 #endif
82  static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); }
83  static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); }
84  static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); }
85 
86  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); }
87  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); }
88  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
89  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
90  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
91  static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }
92 
93  static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
94  static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }
95 
96  static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
97  static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
98  static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
99  static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }
100 
101  //X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); }
102  static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
103  static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
104 
105  //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 (
106  //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
107  //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 (
108  //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
109  static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(
110  _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
111  static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(
112  _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
113  static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(
114  _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
115  static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(
116  _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
117 } // namespace SSE
118 } // namespace Vc
119 } // namespace ROOT
120 
121 // SSE3
122 #ifdef VC_IMPL_SSE3
123 extern "C" {
124 #include <pmmintrin.h>
125 }
126 #endif
127 // SSSE3
128 #ifdef VC_IMPL_SSSE3
129 extern "C" {
130 #include <tmmintrin.h>
131 }
132 #define mm_abs_epi8 _mm_abs_epi8
133 #define mm_abs_epi16 _mm_abs_epi16
134 #define mm_abs_epi32 _mm_abs_epi32
135 #define mm_alignr_epi8 _mm_alignr_epi8
136 namespace ROOT {
137 namespace Vc
138 {
139 namespace SSE
140 {
141 
142  // not overriding _mm_set1_epi8 because this one should only be used for non-constants
143  static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
144 #if defined(VC_GCC) && VC_GCC < 0x40500
145  return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
146 #else
147  // GCC 4.5 nows about the pshufb improvement
148  return _mm_set1_epi8(a);
149 #endif
150  }
151 
152 } // namespace SSE
153 } // namespace Vc
154 } // namespace ROOT
155 #else
156 namespace ROOT {
157 namespace Vc
158 {
159 namespace SSE
160 {
161  static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi8 (__m128i a) {
162  __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
163  return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8()));
164  }
165  // positive value:
166  // negative == 0
167  // a unchanged after xor
168  // 0 >> 31 -> 0
169  // a + 0 -> a
170  // negative value:
171  // negative == -1
172  // a xor -1 -> -a - 1
173  // -1 >> 31 -> 1
174  // -a - 1 + 1 -> -a
175  static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi16(__m128i a) {
176  __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
177  return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
178  }
179  static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi32(__m128i a) {
180  __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
181  return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
182  }
183  static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
184  return _mm_set1_epi8(a);
185  }
186  static Vc_INTRINSIC __m128i Vc_CONST mm_alignr_epi8(__m128i a, __m128i b, const int s) {
187  switch (s) {
188  case 0: return b;
189  case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
190  case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
191  case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
192  case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
193  case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
194  case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
195  case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
196  case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
197  case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
198  case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
199  case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
200  case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
201  case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
202  case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
203  case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
204  case 16: return a;
205  case 17: return _mm_srli_si128(a, 1);
206  case 18: return _mm_srli_si128(a, 2);
207  case 19: return _mm_srli_si128(a, 3);
208  case 20: return _mm_srli_si128(a, 4);
209  case 21: return _mm_srli_si128(a, 5);
210  case 22: return _mm_srli_si128(a, 6);
211  case 23: return _mm_srli_si128(a, 7);
212  case 24: return _mm_srli_si128(a, 8);
213  case 25: return _mm_srli_si128(a, 9);
214  case 26: return _mm_srli_si128(a, 10);
215  case 27: return _mm_srli_si128(a, 11);
216  case 28: return _mm_srli_si128(a, 12);
217  case 29: return _mm_srli_si128(a, 13);
218  case 30: return _mm_srli_si128(a, 14);
219  case 31: return _mm_srli_si128(a, 15);
220  }
221  return _mm_setzero_si128();
222  }
223 
224 } // namespace SSE
225 } // namespace Vc
226 } // namespace ROOT
227 
228 #endif
229 
230 // SSE4.1
231 #ifdef VC_IMPL_SSE4_1
232 extern "C" {
233 #include <smmintrin.h>
234 }
235 namespace ROOT {
236 namespace Vc
237 {
238 namespace SSE
239 {
240 #define mm_blendv_pd _mm_blendv_pd
241 #define mm_blendv_ps _mm_blendv_ps
242 #define mm_blendv_epi8 _mm_blendv_epi8
243 #define mm_blend_epi16 _mm_blend_epi16
244 #define mm_blend_ps _mm_blend_ps
245 #define mm_blend_pd _mm_blend_pd
246 
247 #define mm_min_epi32 _mm_min_epi32
248 #define mm_max_epi32 _mm_max_epi32
249 #define mm_min_epu32 _mm_min_epu32
250 #define mm_max_epu32 _mm_max_epu32
251 //#define mm_min_epi16 _mm_min_epi16
252 //#define mm_max_epi16 _mm_max_epi16
253 #define mm_min_epu16 _mm_min_epu16
254 #define mm_max_epu16 _mm_max_epu16
255 #define mm_min_epi8 _mm_min_epi8
256 #define mm_max_epi8 _mm_max_epi8
257 
258 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32
259 #define mm_cvtepu8_epi16 _mm_cvtepu8_epi16
260 #define mm_cvtepi8_epi16 _mm_cvtepi8_epi16
261 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32
262 #define mm_cvtepi16_epi32 _mm_cvtepi16_epi32
263 #define mm_cvtepu8_epi32 _mm_cvtepu8_epi32
264 #define mm_cvtepi8_epi32 _mm_cvtepi8_epi32
265 #define mm_stream_load_si128 _mm_stream_load_si128
266 // TODO
267 } // namespace SSE
268 } // namespace Vc
269 } // namespace ROOT
270 #else
271 namespace ROOT {
272 namespace Vc
273 {
274 namespace SSE
275 {
276  static Vc_INTRINSIC __m128d mm_blendv_pd(__m128d a, __m128d b, __m128d c) {
277  return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
278  }
279  static Vc_INTRINSIC __m128 mm_blendv_ps(__m128 a, __m128 b, __m128 c) {
280  return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
281  }
282  static Vc_INTRINSIC __m128i mm_blendv_epi8(__m128i a, __m128i b, __m128i c) {
283  return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
284  }
285 
286  // only use the following blend functions with immediates as mask and, of course, compiling
287  // with optimization
288  static Vc_INTRINSIC __m128d mm_blend_pd(__m128d a, __m128d b, const int mask) {
289  switch (mask) {
290  case 0x0:
291  return a;
292  case 0x1:
293  return _mm_shuffle_pd(b, a, 2);
294  case 0x2:
295  return _mm_shuffle_pd(a, b, 2);
296  case 0x3:
297  return b;
298  default:
299  abort();
300  return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value'
301  }
302  }
303  static Vc_INTRINSIC __m128 mm_blend_ps(__m128 a, __m128 b, const int mask) {
304  __m128i c;
305  switch (mask) {
306  case 0x0:
307  return a;
308  case 0x1:
309  c = _mm_srli_si128(_mm_setallone_si128(), 12);
310  break;
311  case 0x2:
312  c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
313  break;
314  case 0x3:
315  c = _mm_srli_si128(_mm_setallone_si128(), 8);
316  break;
317  case 0x4:
318  c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
319  break;
320  case 0x5:
321  c = _mm_set_epi32(0, -1, 0, -1);
322  break;
323  case 0x6:
324  c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
325  break;
326  case 0x7:
327  c = _mm_srli_si128(_mm_setallone_si128(), 4);
328  break;
329  case 0x8:
330  c = _mm_slli_si128(_mm_setallone_si128(), 12);
331  break;
332  case 0x9:
333  c = _mm_set_epi32(-1, 0, 0, -1);
334  break;
335  case 0xa:
336  c = _mm_set_epi32(-1, 0, -1, 0);
337  break;
338  case 0xb:
339  c = _mm_set_epi32(-1, 0, -1, -1);
340  break;
341  case 0xc:
342  c = _mm_slli_si128(_mm_setallone_si128(), 8);
343  break;
344  case 0xd:
345  c = _mm_set_epi32(-1, -1, 0, -1);
346  break;
347  case 0xe:
348  c = _mm_slli_si128(_mm_setallone_si128(), 4);
349  break;
350  case 0xf:
351  return b;
352  default: // may not happen
353  abort();
354  c = _mm_setzero_si128();
355  break;
356  }
357  __m128 _c = _mm_castsi128_ps(c);
358  return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
359  }
360  static Vc_INTRINSIC __m128i mm_blend_epi16(__m128i a, __m128i b, const int mask) {
361  __m128i c;
362  switch (mask) {
363  case 0x00:
364  return a;
365  case 0x01:
366  c = _mm_srli_si128(_mm_setallone_si128(), 14);
367  break;
368  case 0x03:
369  c = _mm_srli_si128(_mm_setallone_si128(), 12);
370  break;
371  case 0x07:
372  c = _mm_srli_si128(_mm_setallone_si128(), 10);
373  break;
374  case 0x0f:
375  return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
376  case 0x1f:
377  c = _mm_srli_si128(_mm_setallone_si128(), 6);
378  break;
379  case 0x3f:
380  c = _mm_srli_si128(_mm_setallone_si128(), 4);
381  break;
382  case 0x7f:
383  c = _mm_srli_si128(_mm_setallone_si128(), 2);
384  break;
385  case 0x80:
386  c = _mm_slli_si128(_mm_setallone_si128(), 14);
387  break;
388  case 0xc0:
389  c = _mm_slli_si128(_mm_setallone_si128(), 12);
390  break;
391  case 0xe0:
392  c = _mm_slli_si128(_mm_setallone_si128(), 10);
393  break;
394  case 0xf0:
395  c = _mm_slli_si128(_mm_setallone_si128(), 8);
396  break;
397  case 0xf8:
398  c = _mm_slli_si128(_mm_setallone_si128(), 6);
399  break;
400  case 0xfc:
401  c = _mm_slli_si128(_mm_setallone_si128(), 4);
402  break;
403  case 0xfe:
404  c = _mm_slli_si128(_mm_setallone_si128(), 2);
405  break;
406  case 0xff:
407  return b;
408  case 0xcc:
409  return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
410  case 0x33:
411  return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
412  default:
413  const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
414  c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
415  break;
416  }
417  return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
418  }
419 
420  static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi8 (__m128i a, __m128i b) {
421  return mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
422  }
423  static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi32(__m128i a, __m128i b) {
424  return mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
425  }
426 //X static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu8 (__m128i a, __m128i b) {
427 //X return mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b));
428 //X }
429  static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu16(__m128i a, __m128i b) {
430  return mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b));
431  }
432  static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu32(__m128i a, __m128i b) {
433  return mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b));
434  }
435 //X static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu8 (__m128i a, __m128i b) {
436 //X return mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b));
437 //X }
438  static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu16(__m128i a, __m128i b) {
439  return mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b));
440  }
441  static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu32(__m128i a, __m128i b) {
442  return mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b));
443  }
444  static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi8 (__m128i a, __m128i b) {
445  return mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
446  }
447  static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi32(__m128i a, __m128i b) {
448  return mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
449  }
450  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi16(__m128i epu8) {
451  return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
452  }
453  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi16(__m128i epi8) {
454  return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
455  }
456  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu16_epi32(__m128i epu16) {
457  return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
458  }
459  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi16_epi32(__m128i epu16) {
460  return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
461  }
462  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi32(__m128i epu8) {
463  return mm_cvtepu16_epi32(mm_cvtepu8_epi16(epu8));
464  }
465  static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi32(__m128i epi8) {
466  const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
467  const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
468  return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
469  }
470  static Vc_INTRINSIC Vc_PURE __m128i mm_stream_load_si128(__m128i *mem) {
471  return _mm_load_si128(mem);
472  }
473 
474 } // namespace SSE
475 } // namespace Vc
476 } // namespace ROOT
477 #endif
478 
479 #ifdef VC_IMPL_POPCNT
480 #include <popcntintrin.h>
481 #endif
482 
483 // SSE4.2
484 #ifdef VC_IMPL_SSE4_2
485 extern "C" {
486 #include <nmmintrin.h>
487 }
488 #endif
489 
490 namespace ROOT {
491 namespace Vc
492 {
493 namespace SSE
494 {
495  static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) {
496  float f = 0.;
497  switch (i) {
498  case 0:
499  f = _mm_cvtss_f32(v);
500  break;
501 #if defined VC_IMPL_SSE4_1 && !defined VC_MSVC
502  default:
503 #ifdef VC_GCC
504  f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
505 #else
506  // MSVC fails to compile this because it can't optimize i to an immediate
507  _MM_EXTRACT_FLOAT(f, v, i);
508 #endif
509  break;
510 #else
511  case 1:
512  f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
513  break;
514  case 2:
515  f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
516  break;
517  case 3:
518  f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
519  break;
520 #endif
521  }
522  return f;
523  }
524  static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) {
525  if (i == 0) {
526  return _mm_cvtsd_f64(v);
527  }
528  return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v))));
529  }
530  static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) {
531 #ifdef VC_GCC
532  if (__builtin_constant_p(i)) {
533  return extract_float_imm(v, i);
534 //X if (index <= 1) {
535 //X unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v));
536 //X if (index == 0) tmp &= 0xFFFFFFFFull;
537 //X if (index == 1) tmp >>= 32;
538 //X return Common::AliasingEntryHelper<EntryType>(tmp);
539 //X }
540  } else {
541  typedef float float4[4] Vc_MAY_ALIAS;
542  const float4 &data = reinterpret_cast<const float4 &>(v);
543  return data[i];
544  }
545 #else
546  union { __m128 v; float m[4]; } u;
547  u.v = v;
548  return u.m[i];
549 #endif
550  }
551 
552  static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
553 #ifdef VC_IMPL_SSE4_1
554  return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
555 #else
556  return _mm_load_ps(mem);
557 #endif
558  }
559  static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
560 #ifdef VC_IMPL_SSE4_1
561  return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
562 #else
563  return _mm_load_pd(mem);
564 #endif
565  }
566  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
567 #ifdef VC_IMPL_SSE4_1
568  return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
569 #else
570  return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
571 #endif
572  }
573  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
574  return _mm_stream_load(reinterpret_cast<const int *>(mem));
575  }
576  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
577  return _mm_stream_load(reinterpret_cast<const int *>(mem));
578  }
579  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
580  return _mm_stream_load(reinterpret_cast<const int *>(mem));
581  }
582  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
583  return _mm_stream_load(reinterpret_cast<const int *>(mem));
584  }
585  static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
586  return _mm_stream_load(reinterpret_cast<const int *>(mem));
587  }
588 } // namespace SSE
589 } // namespace Vc
590 } // namespace ROOT
591 
592 // XOP / FMA4
593 #if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)
594 extern "C" {
595 #include <x86intrin.h>
596 }
597 #endif
598 
599 #include "undomacros.h"
600 #include "shuffle.h"
601 
602 #endif // SSE_INTRINSICS_H
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi16_epi32(__m128i epu16)
Definition: intrinsics.h:459
static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i)
Definition: intrinsics.h:495
Namespace for new ROOT classes and functions.
Definition: ROOT.py:1
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi8(__m128i a, __m128i b)
Definition: intrinsics.h:444
static Vc_INTRINSIC __m128d mm_blend_pd(__m128d a, __m128d b, const int mask)
Definition: intrinsics.h:288
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()
Definition: intrinsics.h:89
static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd()
Definition: intrinsics.h:83
static Vc_INTRINSIC __m128 mm_blend_ps(__m128 a, __m128 b, const int mask)
Definition: intrinsics.h:303
static Vc_INTRINSIC __m128i mm_blend_epi16(__m128i a, __m128i b, const int mask)
Definition: intrinsics.h:360
TArc * a
Definition: textangle.C:12
static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps()
Definition: intrinsics.h:99
#define Vc_INTRINSIC
Definition: macros.h:139
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8()
Definition: intrinsics.h:87
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b)
Definition: intrinsics.h:109
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu32(__m128i a, __m128i b)
Definition: intrinsics.h:432
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi32(__m128i epi8)
Definition: intrinsics.h:465
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi32(__m128i epu8)
Definition: intrinsics.h:462
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128()
Definition: intrinsics.h:82
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi8(__m128i a)
Definition: intrinsics.h:161
static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps()
Definition: intrinsics.h:84
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi32(__m128i a)
Definition: intrinsics.h:179
static Vc_INTRINSIC __m128i mm_blendv_epi8(__m128i a, __m128i b, __m128i c)
Definition: intrinsics.h:282
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd()
Definition: intrinsics.h:98
#define Vc_MAY_ALIAS
Definition: macros.h:115
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu16(__m128i a, __m128i b)
Definition: intrinsics.h:429
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b)
Definition: intrinsics.h:111
#define Vc_PURE
Definition: macros.h:136
static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi16(__m128i a)
Definition: intrinsics.h:175
static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps()
Definition: intrinsics.h:93
ROOT::R::TRInterface & r
Definition: Object.C:4
SVector< double, 2 > v
Definition: Dict.h:5
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()
Definition: intrinsics.h:88
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi8(__m128i a, __m128i b)
Definition: intrinsics.h:420
static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i)
Definition: intrinsics.h:524
static Vc_INTRINSIC Vc_PURE __m128i mm_stream_load_si128(__m128i *mem)
Definition: intrinsics.h:470
TMarker * m
Definition: textangle.C:8
#define Vc_CONST
Definition: macros.h:133
static Vc_INTRINSIC __m128d mm_blendv_pd(__m128d a, __m128d b, __m128d c)
Definition: intrinsics.h:276
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi16(__m128i epu8)
Definition: intrinsics.h:450
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b)
Definition: intrinsics.h:113
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu16(__m128i a, __m128i b)
Definition: intrinsics.h:438
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8()
Definition: intrinsics.h:86
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem)
Definition: intrinsics.h:552
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi32(__m128i a, __m128i b)
Definition: intrinsics.h:447
double f(double x)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32()
Definition: intrinsics.h:103
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone()
Definition: intrinsics.h:80
static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu32(__m128i a, __m128i b)
Definition: intrinsics.h:441
static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd()
Definition: intrinsics.h:96
#define SSE
Definition: global.h:84
static Vc_INTRINSIC __m128 mm_blendv_ps(__m128 a, __m128 b, __m128 c)
Definition: intrinsics.h:279
static Vc_INTRINSIC __m128i Vc_CONST mm_alignr_epi8(__m128i a, __m128i b, const int s)
Definition: intrinsics.h:186
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()
Definition: intrinsics.h:91
static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i)
Definition: intrinsics.h:530
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi16(__m128i epi8)
Definition: intrinsics.h:453
static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a)
Definition: intrinsics.h:183
static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi32(__m128i a, __m128i b)
Definition: intrinsics.h:423
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b)
Definition: intrinsics.h:115
Definition: casts.h:28
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()
Definition: intrinsics.h:94
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()
Definition: intrinsics.h:90
static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu16_epi32(__m128i epu16)
Definition: intrinsics.h:456
static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps()
Definition: intrinsics.h:97
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16()
Definition: intrinsics.h:102