ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
casts.h
Go to the documentation of this file.
1 /* This file is part of the Vc library.
2 
3  Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
4 
5  Vc is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as
7  published by the Free Software Foundation, either version 3 of
8  the License, or (at your option) any later version.
9 
10  Vc is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17 
18 */
19 
20 #ifndef AVX_CASTS_H
21 #define AVX_CASTS_H
22 
23 #include "intrinsics.h"
24 #include "types.h"
25 #include "macros.h"
26 
27 namespace ROOT {
28 namespace Vc
29 {
30 namespace AVX
31 {
32  template<typename T> static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; // primary templates: declared only; the explicit specializations below provide one definition per (T, argument type) pair
33  template<typename T> static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R;
34  template<typename T> static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R;
35  template<typename T> static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R;
36  template<typename T> static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R;
37  template<typename T> static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R;
38 
39 #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
40  template<typename T> static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast<T>(param128 (v)); } // raw intrinsic types: convert the argument to the matching paramNNN type and forward to the overloads declared above
41  template<typename T> static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast<T>(param128i(v)); }
42  template<typename T> static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast<T>(param128d(v)); }
43  template<typename T> static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast<T>(param256 (v)); }
44  template<typename T> static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast<T>(param256i(v)); }
45  template<typename T> static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast<T>(param256d(v)); }
46 #endif
47 
48  // 128 -> 128: same-width casts; identical types are returned unchanged, all others go through the _mm_cast* intrinsics
49  template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; }
50  template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); }
51  template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); }
52  template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); }
53  template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; }
54  template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); }
55  template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); }
56  template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); }
57  template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; }
58 
59  // 128 -> 256: widen to the matching 256-bit type, first switching the element type with a _mm_cast* intrinsic where source and target differ
60  // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
61  // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
62  // upper 128bits are zero. Thus using the same register as AVX register will have the upper
63  // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
64  // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
65  // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
66  // do we really want to rely on specific compiler behavior here?
67  template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); }
68  template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
69  template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
70  template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
71  template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); }
72  template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
73  template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
74  template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
75  template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); }
76 
77 #ifdef VC_MSVC
78  static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } // MSVC: permute2f128 with imm 0x80 keeps the low 128 bits of v and explicitly zeroes the high 128 bits
79  static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
80  static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
81 #else
82  static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } // other compilers: plain cast, relying on the upper 128 bits already being zero (see the FIXME above)
83  static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); }
84  static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); }
85 #ifdef VC_ICC
86  static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } // ICC additionally gets overloads taking the raw __m128* types
87  static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
88  static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
89 #endif
90 #endif
91 
92  // 256 -> 128: the _mm256_cast*256_*128 intrinsics return the low 128 bits; the element type is switched at 256-bit width first where needed
93  template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); }
94  template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
95  template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
96  template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
97  template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); }
98  template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
99  template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
100  template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
101  template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); }
102 
103  // 256 -> 256: same-width casts; identical types are returned unchanged, all others go through the _mm256_cast* intrinsics
104  template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; }
105  template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); }
106  template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); }
107  template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); }
108  template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; }
109  template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); }
110  template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); }
111  template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); }
112  template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; }
113 
114  // simplify splitting 256-bit registers in 128-bit registers: lo128 returns the low 128 bits (via the 256 -> 128 avx_cast), hi128 extracts 128-bit lane 1
115  Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast<m128>(v); }
116  Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast<m128d>(v); }
117  Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast<m128i>(v); }
118  Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); }
119  Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); }
120  Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); }
121 
122  // simplify combining 128-bit registers in 256-bit registers: a is widened to 256 bits and b is inserted as 128-bit lane 1
123  Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
124  Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
125  Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
126 #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
127  Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } // same bodies repeated for every mix of raw __m128* and param128* arguments
128  Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
129  Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
130  Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
131  Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
132  Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
133  Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
134  Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
135  Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
136 #endif
137 
138  template<typename From, typename To> struct StaticCastHelper {}; // empty primary template; each supported From -> To conversion is an explicit specialization providing a static cast()
139  template<> struct StaticCastHelper<float , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } };
140  template<> struct StaticCastHelper<double , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } };
141  template<> struct StaticCastHelper<int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
142  template<> struct StaticCastHelper<unsigned int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
143  template<> struct StaticCastHelper<short , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; // each 16-bit element is duplicated into a 32-bit slot, then an arithmetic shift right by 16 sign-extends it
144  template<> struct StaticCastHelper<float , unsigned int > { static inline Vc_CONST m256i cast(param256 v) { // float -> unsigned int: blend of two signed conversions so inputs >= 2^31 are handled
145  return _mm256_castps_si256(_mm256_blendv_ps(
146  _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), // selected where the mask is clear: direct truncating conversion
147  _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), // selected where the mask is set: convert v - 2^31, then add 2^31 back as an integer
148  _mm256_cmpge_ps(v, _mm256_set2power31_ps()) // mask: elements with v >= 2^31 (restored; the call previously had no third argument)
149  ));
150 
151  } };
152  template<> struct StaticCastHelper<double , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } }; // NOTE(review): same signed _mm256_cvttpd_epi32 conversion as the <double, int> case - confirm the result for inputs >= 2^31
153  template<> struct StaticCastHelper<int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
154  template<> struct StaticCastHelper<unsigned int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
155  template<> struct StaticCastHelper<unsigned short, unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; // each 16-bit element is duplicated into a 32-bit slot, then a logical shift right by 16 zero-extends it
156  template<> struct StaticCastHelper<float , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } };
157  template<> struct StaticCastHelper<double , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast<m256>(_mm256_cvtpd_ps(v)); } };
158  template<> struct StaticCastHelper<int , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } };
159  template<> struct StaticCastHelper<unsigned int , float > { static inline Vc_CONST m256 cast(param256i v) { // unsigned int -> float: blend of two signed conversions
160  return _mm256_blendv_ps(
161  _mm256_cvtepi32_ps(v), // selected where the mask is clear: direct signed conversion
162  _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), // selected where the mask is set: subtract 2^31 before the signed conversion, add 2^31 back as float
163  _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) // mask: elements that compare below zero as signed 32-bit integers
164  );
165  } };
166  template<> struct StaticCastHelper<short , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; // widen to 32-bit integers first, then convert to float
167  template<> struct StaticCastHelper<unsigned short, float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } };
168  template<> struct StaticCastHelper<float , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast<m128>(v)); } }; // only the low 128 bits of v are converted
169  template<> struct StaticCastHelper<double , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } };
170  template<> struct StaticCastHelper<int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } }; // only the low 128 bits of v are converted
171  template<> struct StaticCastHelper<unsigned int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(avx_cast<m128i>(v), avx_cast<m128i>(_mm256_set2power31_epu32()))), _mm256_set1_pd(2147483648.)); } }; // _mm256_cvtepi32_pd is a signed conversion: subtract 2^31 first and add 2^31 back as double so inputs >= 2^31 do not come out negative; only the low 128 bits of v are converted
172  template<> struct StaticCastHelper<int , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; // narrows both 128-bit halves of v into one 128-bit result with _mm_packs_epi32
174  template<> struct StaticCastHelper<short , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
175  template<> struct StaticCastHelper<unsigned short, short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
176  template<> struct StaticCastHelper<unsigned int , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; // narrows both 128-bit halves of v into one 128-bit result with _mm_packus_epi32
178  template<> struct StaticCastHelper<short , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
179  template<> struct StaticCastHelper<unsigned short, unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
182  template<> struct StaticCastHelper<short , sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; // sfloat targets: widen to 32-bit integers first, then convert with _mm256_cvtepi32_ps
183  template<> struct StaticCastHelper<unsigned short, sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } };
184 } // namespace AVX
185 } // namespace Vc
186 } // namespace ROOT
187 
188 #include "undomacros.h"
189 
190 #endif // AVX_CASTS_H
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v)
Definition: casts.h:167
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v)
Definition: casts.h:155
const m256d param256d
Definition: intrinsics.h:129
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v)
Definition: casts.h:152
static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R
Definition: casts.h:49
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v)
Definition: casts.h:140
const m128 param128
Definition: intrinsics.h:125
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v)
Definition: casts.h:180
static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32()
Definition: intrinsics.h:204
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpge_ps(param256 a, param256 b)
Definition: intrinsics.h:236
__m128i m128i
Definition: intrinsics.h:112
__m256d m256d
Definition: intrinsics.h:114
#define Vc_INTRINSIC
Definition: macros.h:139
Vc_INTRINSIC Vc_CONST m128 hi128(param256 v)
Definition: casts.h:118
TTree * T
const m256i param256i
Definition: intrinsics.h:130
Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b)
Definition: casts.h:123
#define AVX
Definition: global.h:90
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v)
Definition: casts.h:183
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v)
Definition: casts.h:157
static Vc_CONST m256i cast(param256 v)
Definition: casts.h:144
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v)
Definition: casts.h:177
static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v)
Definition: casts.h:82
SVector< double, 2 > v
Definition: Dict.h:5
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v)
Definition: casts.h:154
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v)
Definition: casts.h:182
static Vc_INTRINSIC m256 Vc_CONST _mm256_set2power31_ps()
Definition: intrinsics.h:203
#define Vc_INTRINSIC_R
Definition: macros.h:141
__m256i m256i
Definition: intrinsics.h:115
#define Vc_CONST
Definition: macros.h:133
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v)
Definition: casts.h:143
static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v)
Definition: casts.h:170
static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v)
Definition: casts.h:171
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v)
Definition: casts.h:142
#define Vc_ALWAYS_INLINE
Definition: macros.h:130
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v)
Definition: casts.h:141
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v)
Definition: casts.h:172
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v)
Definition: casts.h:176
static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v)
Definition: casts.h:169
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v)
Definition: casts.h:179
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v)
Definition: casts.h:139
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v)
Definition: casts.h:174
#define Vc_INTRINSIC_L
Definition: macros.h:140
const m128d param128d
Definition: intrinsics.h:126
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v)
Definition: casts.h:158
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v)
Definition: casts.h:166
static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v)
Definition: casts.h:168
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v)
Definition: casts.h:173
static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v)
Definition: casts.h:153
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v)
Definition: casts.h:178
static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v)
Definition: casts.h:156
static Vc_CONST m256 cast(param256i v)
Definition: casts.h:159
const m256 param256
Definition: intrinsics.h:128
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v)
Definition: casts.h:175
static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v)
Definition: casts.h:181
__m128d m128d
Definition: intrinsics.h:111
Vc_INTRINSIC Vc_CONST m128 lo128(param256 v)
Definition: casts.h:115
const m128i param128i
Definition: intrinsics.h:127