ROOT 6.06/09 Reference Guide
shuffle.h
/* This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef VC_SSE_SHUFFLE_H
#define VC_SSE_SHUFFLE_H

#include "macros.h"

namespace ROOT {
namespace Vc
{
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
    };

    namespace Mem
    {
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
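
        // Worked example (an added illustration, not in the upstream file):
        // shuffle<X1, X2, Y0, Y2> makes the immediate
        // 1 + 2*4 + (Y0-Y0)*16 + (Y2-Y0)*64 = 137 == _MM_SHUFFLE(2, 0, 2, 1), so
        //   __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f); // memory order [0 1 2 3]
        //   __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f); // memory order [4 5 6 7]
        //   Mem::shuffle<X1, X2, Y0, Y2>(a, b);         // [1 2 4 6] in memory order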

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range);
            return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
        }

#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
#define Vc_MAKE_INTRINSIC__(name__) Vc::SSE::_VC_CAT(m,m,_,name__)
#else
#define Vc_MAKE_INTRINSIC__(name__) _VC_CAT(_,mm,_,name__)
#endif

        // blend<X0, Y1>([x0 x1], [y0 y1]) = [x0 y1]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            return Vc_MAKE_INTRINSIC__(blend_pd)(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2);
        }
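
        // Example (added illustration): blend<X0, Y1> yields the mask
        // (X0 / Y0) + (Y1 / Y0) * 2 = 0 + 1 * 2 = 2, i.e. blend_pd(x, y, 2),
        // which keeps lane 0 from x and takes lane 1 from y: [x0 y1].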

        // blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
            return Vc_MAKE_INTRINSIC__(blend_ps)(x, y,
                    (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                    (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
            VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
            VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
            return Vc_MAKE_INTRINSIC__(blend_epi16)(x, y,
                    (Dst0 / Y0) *  1 + (Dst1 / Y1) *  2 +
                    (Dst2 / Y2) *  4 + (Dst3 / Y3) *  8 +
                    (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                    (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128
                    );
        }

        // permute<X1, X2, X0, X3>([x0 x1 x2 x3]) = [x1 x2 x0 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
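
        // Example (added illustration): permute<X2, X3, X0, X1> rotates the
        // vector by two lanes: the immediate is 2 + 3*4 + 0*16 + 1*64 = 78,
        // turning [x0 x1 x2 x3] into [x2 x3 x0 x1] (both in memory order).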

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range);
            return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range);
            if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
                x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            }
            if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
                x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            }
            return x;
        }
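
        // Note (added): the two compile-time conditions above let the 8x16-bit
        // permute collapse to a single instruction when one half keeps its
        // identity order, e.g. permute<X1, X0, X2, X3, X4, X5, X6, X7>
        // emits only the _mm_shufflelo_epi16.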
    } // namespace Mem

    // The shuffles and permutes above use memory ordering. The ones below use register ordering:
    namespace Reg
    {
        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
        }
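
        // Example (added illustration): Reg::shuffle<Y2, Y0, X2, X1>(x, y)
        // selects the same elements as Mem::shuffle<X1, X2, Y0, Y2>(x, y);
        // the instruction is identical, only the notation is reversed. In
        // register notation the leftmost template argument names the most
        // significant element.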

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [x1 y0]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }

        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
    } // namespace Reg
} // namespace Vc
} // namespace ROOT

#include "undomacros.h"

#endif // VC_SSE_SHUFFLE_H
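
A minimal usage sketch tying the two orderings together (an illustration added
here, not part of the file; it assumes the header is reachable under the
include path shown and that the translation unit is compiled with SSE2
support):

    #include <cstdio>
    #include <emmintrin.h>
    #include "shuffle.h" // hypothetical path; adjust to the actual Vc layout

    int main() {
        using namespace ROOT::Vc;
        const __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f); // memory order [0 1 2 3]
        const __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f); // memory order [4 5 6 7]

        // Memory ordering: destination slots are named left to right.
        const __m128 m = Mem::shuffle<X1, X2, Y0, Y2>(a, b);
        // Register ordering: the same selection, listed from the high element down.
        const __m128 r = Reg::shuffle<Y2, Y0, X2, X1>(a, b);

        float out[4];
        _mm_storeu_ps(out, m);
        std::printf("Mem: %g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 4 6
        _mm_storeu_ps(out, r);
        std::printf("Reg: %g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 4 6
    }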