ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
shuffle.h
Go to the documentation of this file.
1 /* This file is part of the Vc library.
2 
3  Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
4 
5  Vc is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as
7  published by the Free Software Foundation, either version 3 of
8  the License, or (at your option) any later version.
9 
10  Vc is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17 
18 */
19 
20 #ifndef VC_AVX_SHUFFLE_H
21 #define VC_AVX_SHUFFLE_H
22 
23 #include "../sse/shuffle.h"
24 #include "macros.h"
25 
26 namespace ROOT {
27 namespace Vc
28 {
29  using AVX::m128;
30  using AVX::m128d;
31  using AVX::m128i;
32  using AVX::m256;
33  using AVX::m256d;
34  using AVX::m256i;
35  using AVX::param128;
36  using AVX::param128d;
37  using AVX::param128i;
38  using AVX::param256;
39  using AVX::param256d;
40  using AVX::param256i;
41  namespace Mem
42  {
43  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) {
44  VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
45  VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
46  return _mm256_permute2f128_ps(x, x, L + H * (1 << 4));
47  }
48  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) {
49  VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
50  VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
51  return _mm256_permute2f128_pd(x, x, L + H * (1 << 4));
52  }
53  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) {
54  VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
55  VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
56  return _mm256_permute2f128_si256(x, x, L + H * (1 << 4));
57  }
58  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) {
59  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
60  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
61  return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
62  }
63  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) {
64  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
65  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
66  return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
67  }
68  template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) {
69  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
70  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
71  return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
72  }
73  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
74  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
75  VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
76  return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
77  }
78  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
79  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
80  VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
81  return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
82  }
83  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) {
84  return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
85  }
86  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
87  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
88  VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
89  return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
90  }
91  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
92  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
93  VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
94  return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
95  }
96  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
98  VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
99  VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
100  VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
101  VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
102  VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
103  VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
104  VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
105  VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
106  return _mm256_blend_ps(x, y,
107  (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
108  (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
109  (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
110  (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
111  );
112  }
113  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
115  return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
116  }
117  template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
// 8-element float permute across both 128-bit lanes (memory order). The lane-
// crossing cases are decomposed into two 4-wide operations on the extracted
// low/high lanes, then recombined.
// NOTE(review): this extraction is missing the function signature line
// (shuffle.h:119) as well as lines 152-153 and 173-174 (the bodies of the
// final "else if" branches for lo and hi) — recover them from the original
// shuffle.h before building from this listing.
118  template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
// All eight destination selectors must address one of the eight source
// positions X0..X7.
120  VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range);
121  VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range);
122  VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range);
123  VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range);
124  VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range);
125  VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range);
126  VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range);
127  VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range);
// Fast path: both halves use the same in-lane pattern, so a single 4-wide
// in-lane permute over the full 256-bit register suffices.
128  if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
129  return permute<Dst0, Dst1, Dst2, Dst3>(x);
130  }
// Split the input into its two 128-bit lanes and compute each result half
// separately.
131  const m128 loIn = _mm256_castps256_ps128(x);
132  const m128 hiIn = _mm256_extractf128_ps(x, 1);
133  m128 lo, hi;
134 
// Low result half: pick the cheapest 4-wide operation that realizes
// Dst0..Dst3 (single-source permute, two-source shuffle, or unpack).
135  if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
136  lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
137  } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
// NOTE(review): here the selectors are all >= X4 but are used un-rebased in
// the immediate — presumably correct only if the intrinsic masks each 2-bit
// field; compare with the rebased form used for hi below. Confirm against
// upstream Vc.
138  lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
139  } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
140  lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
141  } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
142  lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
143  } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
144  lo = _mm_unpacklo_ps(loIn, hiIn);
145  } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
146  lo = _mm_unpacklo_ps(hiIn, loIn);
147  } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
148  lo = _mm_unpackhi_ps(loIn, hiIn);
149  } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
150  lo = _mm_unpackhi_ps(hiIn, loIn);
151  } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
// NOTE(review): the body of this branch (original lines 152-153) is missing
// from this extraction.
154  }
155 
// High result half, mirroring the low-half case analysis for Dst4..Dst7.
156  if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
157  hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
158  } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
// NOTE(review): suspicious — all of Dst4..Dst7 are < X4 in this branch, yet
// X4 is still subtracted, producing negative field values in the immediate.
// Looks like it should use Dst4..Dst7 directly; verify against upstream Vc.
159  hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
160  } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
161  hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
162  } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
163  hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
164  } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
165  hi = _mm_unpacklo_ps(loIn, hiIn);
166  } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
167  hi = _mm_unpacklo_ps(hiIn, loIn);
168  } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
169  hi = _mm_unpackhi_ps(loIn, hiIn);
170  } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
171  hi = _mm_unpackhi_ps(hiIn, loIn);
172  } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
// NOTE(review): the body of this branch (original lines 173-174) is missing
// from this extraction.
175  }
176 
// Reassemble the two 128-bit halves into one 256-bit result.
177  return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
178  }
179  } // namespace Mem
180 
181  // little endian has the lo bits on the right and high bits on the left
182  // with vectors this becomes greatly confusing:
183  // Mem: abcd
184  // Reg: dcba
185  //
186  // The shuffles and permutes above use memory ordering. The ones below use register ordering:
187  namespace Reg
188  {
189  template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) {
190  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
191  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
192  return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
193  }
194  template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) {
195  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
196  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
197  return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
198  }
199  template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) {
200  VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
201  VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
202  return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
203  }
204  template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
205  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
206  VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
207  return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
208  }
209  template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
210  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
211  VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
212  return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
213  }
214  template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) {
215  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range);
216  VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range);
217  return _mm_permute_pd(x, Dst0 + Dst1 * 2);
218  }
219  template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) {
220  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
221  VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
222  return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
223  }
224  template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
225  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
226  VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
227  return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
228  }
229  template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
230  VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
231  VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
232  return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
233  }
234  } // namespace Reg
235 } // namespace Vc
236 } // namespace ROOT
237 #include "undomacros.h"
238 
239 #endif // VC_AVX_SHUFFLE_H
static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y)
Definition: shuffle.h:97
const m256d param256d
Definition: intrinsics.h:129
Float_t Mem()
Definition: kDTreeTest.cxx:35
RooArgList L(const RooAbsArg &v1)
static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x)
Definition: shuffle.h:73
const m128 param128
Definition: intrinsics.h:125
#define H(x, y, z)
__m128i m128i
Definition: intrinsics.h:112
__m256d m256d
Definition: intrinsics.h:114
static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x)
Definition: shuffle.h:204
Double_t x[n]
Definition: legend1.C:17
static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x)
Definition: shuffle.h:43
const m256i param256i
Definition: intrinsics.h:130
__m256i m256i
Definition: intrinsics.h:115
#define Vc_CONST
Definition: macros.h:133
static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y)
Definition: shuffle.h:58
static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y)
Definition: shuffle.h:224
#define Vc_ALWAYS_INLINE
Definition: macros.h:130
Double_t y[n]
Definition: legend1.C:17
const m128d param128d
Definition: intrinsics.h:126
static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y)
Definition: shuffle.h:86
#define VC_STATIC_ASSERT(cond, msg)
Definition: macros.h:246
float type_of_call hi(const int &, const int &)
static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y)
Definition: shuffle.h:189
const m256 param256
Definition: intrinsics.h:128
const char * Value
Definition: TXMLSetup.cxx:73
__m128d m128d
Definition: intrinsics.h:111
const m128i param128i
Definition: intrinsics.h:127