ls1-MarDyn
ls1-MarDyn molecular dynamics code
RealAccumVecSPDP.h
1/*
2 * RealAccumVecSPDP.h
3 *
4 * Created on: 7 Feb 2018
5 * Author: tchipevn
6 */
7
8#ifndef SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALACCUMVECSPDP_H_
9#define SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALACCUMVECSPDP_H_
10
11#include "RealVec.h"
12
13namespace vcp {
14
15#if VCP_VEC_WIDTH != VCP_VEC_W__64
16// the novec case is handled differently, as it requires only one RealVec<double> to store its results.
17
18class RealAccumVecSPDP {
19
20private:
21 RealVec<double> _first;
22 RealVec<double> _second;
23
24public:
25 vcp_inline
26 RealAccumVecSPDP() {}
27
28 vcp_inline
29 static RealAccumVecSPDP convertCalcToAccum(const RealCalcVec & rcv) {
30 RealVec<double> first = convert_low(rcv);
31 RealVec<double> second = convert_high(rcv);
32 return RealAccumVecSPDP(first, second);
33 }
34
35 vcp_inline
36 static RealCalcVec convertAccumToCalc(const RealAccumVecSPDP & rav) {
37 RealCalcVec ret = back_convert(rav._first, rav._second);
38 return ret;
39 }
40
41 vcp_inline
42 RealAccumVecSPDP(const RealAccumVecSPDP& rhs) {
43 _first = rhs._first;
44 _second = rhs._second;
45 }
46
47 vcp_inline
48 RealAccumVecSPDP(const RealVec<double>& first, const RealVec<double>& second) {
49 _first = first;
50 _second = second;
51 }
52
53 vcp_inline
54 RealAccumVecSPDP& operator=(const RealAccumVecSPDP& rhs) {
55 _first = rhs._first;
56 _second = rhs._second;
57 return *this;
58 }
59
60 vcp_inline
61 static RealAccumVecSPDP zero() {
62 RealAccumVecSPDP result;
63 result._first = RealVec<double>::zero();
64 result._second = RealVec<double>::zero();
65 return result;
66 }
67
68 vcp_inline
69 RealAccumVecSPDP operator+(const RealAccumVecSPDP& rhs) const {
70 RealAccumVecSPDP result;
71 result._first = _first + rhs._first;
72 result._second = _second + rhs._second;
73 return result;
74 }
75
76 vcp_inline
77 RealAccumVecSPDP operator*(const RealAccumVecSPDP& rhs) const {
78 RealAccumVecSPDP result;
79 result._first = _first * rhs._first;
80 result._second = _second * rhs._second;
81 return result;
82 }
83
84 vcp_inline
85 RealAccumVecSPDP operator-(const RealAccumVecSPDP& rhs) const {
86 RealAccumVecSPDP result;
87 result._first = _first - rhs._first;
88 result._second = _second - rhs._second;
89 return result;
90 }
91
92 vcp_inline
93 static RealAccumVecSPDP fmadd(const RealAccumVecSPDP & a, const RealAccumVecSPDP& b, const RealAccumVecSPDP& c ) {
94 RealAccumVecSPDP result;
95 result._first = RealVec<double>::fmadd(a._first, b._first, c._first);
96 result._second = RealVec<double>::fmadd(a._second, b._second, c._second);
97 return result;
98 }
99
100 vcp_inline
101 static RealAccumVecSPDP fnmadd(const RealAccumVecSPDP & a, const RealAccumVecSPDP& b, const RealAccumVecSPDP& c ) {
102 RealAccumVecSPDP result;
103 result._first = RealVec<double>::fnmadd(a._first, b._first, c._first);
104 result._second = RealVec<double>::fnmadd(a._second, b._second, c._second);
105 return result;
106 }
107
108 vcp_inline
109 void aligned_store(double * location) const {
110 const size_t offset = sizeof(RealVec<double>) / sizeof(double);
111 _first.aligned_store(location);
112 _second.aligned_store(location + offset);
113 }
114
115 vcp_inline
116 static RealAccumVecSPDP aligned_load(const double * const a) {
117 const size_t offset = sizeof(RealVec<double>) / sizeof(double);
118 RealVec<double> first = RealVec<double>::aligned_load(a);
119 RealVec<double> second = RealVec<double>::aligned_load(a + offset);
120 return RealAccumVecSPDP(first, second);
121 }
122
123 vcp_inline
124 static RealAccumVecSPDP aligned_load_mask(const double * const a, MaskVec<float> m) {
125 // we need to make two masks of type MaskVec<double> from one MaskVec<float>
126 MaskVec<double> m_lo, m_hi;
127 convert_mask_vec(m, m_lo, m_hi);
128
129 const size_t offset = sizeof(RealVec<double>) / sizeof(double);
130
131 RealVec<double> first = RealVec<double>::aligned_load_mask(a, m_lo);
132 RealVec<double> second = RealVec<double>::aligned_load_mask(a + offset, m_hi);
133
134 return RealAccumVecSPDP(first, second);
135 }
136
137 vcp_inline
138 static RealAccumVecSPDP set1(const double& v) {
139 RealVec<double> first = RealVec<double>::set1(v);
140 RealVec<double> second = RealVec<double>::set1(v);
141 return RealAccumVecSPDP(first, second);
142 }
143
144 vcp_inline
145 static void horizontal_add_and_store(const RealAccumVecSPDP& a, double * const mem_addr) {
146 RealVec<double> sum = a._first + a._second;
147 RealVec<double>::horizontal_add_and_store(sum, mem_addr);
148 }
149
150 vcp_inline
151 void aligned_load_add_store(double * location) const {
152 const size_t offset = sizeof(RealVec<double>) / sizeof(double);
153 _first.aligned_load_add_store(location);
154 _second.aligned_load_add_store(location + offset);
155 }
156
157 vcp_inline
158 static RealAccumVecSPDP scal_prod(
159 const RealAccumVecSPDP& a1, const RealAccumVecSPDP& a2, const RealAccumVecSPDP& a3,
160 const RealAccumVecSPDP& b1, const RealAccumVecSPDP& b2, const RealAccumVecSPDP& b3) {
161 return fmadd(a1, b1, fmadd(a2, b2, a3 * b3));
162 }
163
164
165#if VCP_VEC_TYPE == VCP_VEC_KNL_GATHER or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
166 vcp_inline
167 static RealAccumVecSPDP gather_load(const double * const src, const size_t& offset, const vcp_lookupOrMask_vec& lookup) {
168 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
169 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
170 RealVec<double> first (_mm512_i32gather_pd(lookup_256i_lo, src, 8));
171 RealVec<double> second (_mm512_i32gather_pd(lookup_256i_hi, src, 8));
172 return RealAccumVecSPDP(first, second);
173 }
174
175 vcp_inline
176 void scatter_store(double* const addr, const size_t& offset, const vcp_lookupOrMask_vec& lookup) {
177 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
178 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
179 _mm512_i32scatter_pd(addr, lookup_256i_lo, _first, 8);
180 _mm512_i32scatter_pd(addr, lookup_256i_hi, _second, 8);
181 }
182
183 vcp_inline
184 void scatter_store_mask(double* const addr, const size_t& offset, const vcp_lookupOrMask_vec& lookup, const MaskVec<float>& mask) {
185 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
186 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
187 MaskVec<double> m_lo, m_hi;
188 convert_mask_vec(mask, m_lo, m_hi);
189 _mm512_mask_i32scatter_pd(addr, m_lo, lookup_256i_lo, _first, 8);
190 _mm512_mask_i32scatter_pd(addr, m_hi, lookup_256i_hi, _second, 8);
191 }
192#endif
193
194 vcp_inline
195 static RealVec<double> convert_low(const RealCalcVec& rhs) {
196 #if VCP_VEC_WIDTH == VCP_VEC_W__64
197 line not compiled
198 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
199 return _mm_cvtps_pd(rhs);
200 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
201 return _mm256_cvtps_pd(_mm256_extractf128_ps(rhs, 0));
202 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
203 return _mm512_cvtps_pd(_mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(rhs), 0)));
204 #endif
205 }
206
207 vcp_inline
208 static RealVec<double> convert_high(const RealCalcVec& rhs) {
209 #if VCP_VEC_WIDTH == VCP_VEC_W__64
210 line not compiled
211 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
212 return _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(rhs), 8)));
213 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
214 return _mm256_cvtps_pd(_mm256_extractf128_ps(rhs, 1));
215 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
216 return _mm512_cvtps_pd(_mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(rhs), 1)));
217 #endif
218 }
219
220 vcp_inline
221 static RealCalcVec back_convert(const RealVec<double>& lo, const RealVec<double>& hi) {
222 #if VCP_VEC_WIDTH == VCP_VEC_W__64
223 line not compiled
224 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
225 __m128 c_lo = _mm_cvtpd_ps(lo);
226 __m128 c_hi = _mm_cvtpd_ps(hi);
227 return _mm_movelh_ps(c_lo, c_hi);
228 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
229 __m128 c_lo = _mm256_cvtpd_ps(lo);
230 __m128 c_hi = _mm256_cvtpd_ps(hi);
231
232 __m256 ret = _mm256_castps128_ps256(c_lo);
233 ret = _mm256_insertf128_ps(ret, c_hi, 1);
234
235 return ret;
236 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
237 __m256 c_lo = _mm512_cvtpd_ps(lo);
238 __m256 c_hi = _mm512_cvtpd_ps(hi);
239
240 __m512 ret = _mm512_castps256_ps512(c_lo);
241 ret = _mm512_insertf32x8(ret, c_hi, 1);
242
243 return ret;
244 #endif
245 }
246
247 vcp_inline
248 static void convert_mask_vec(const MaskVec<float>& src, MaskVec<double>& lo, MaskVec<double>& hi) {
249 #if VCP_VEC_WIDTH == VCP_VEC_W__64
250 line not compiled
251 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
252
253 lo = _mm_unpacklo_epi32(src, src);
254 hi = _mm_unpackhi_epi32(src, src);
255
256 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
257 __m256 v_3210 = _mm256_castsi256_ps(src);
258
259 __m256i v_2200 = _mm256_castps_si256(_mm256_unpacklo_ps(v_3210, v_3210));
260 __m256i v_3311 = _mm256_castps_si256(_mm256_unpackhi_ps(v_3210, v_3210));
261
262 // need to swap 11 and 22
263 auto v_10 = _mm256_extract_epi64 (v_3311, 0);
264 auto v_11 = _mm256_extract_epi64 (v_3311, 1);
265
266 auto v_20 = _mm256_extract_epi64 (v_2200, 2);
267 auto v_21 = _mm256_extract_epi64 (v_2200, 3);
268
269 __m256i v_2100 = _mm256_insert_epi64 (v_2200, v_10, 2);
270 __m256i v_1100 = _mm256_insert_epi64 (v_2100, v_11, 3);
271 lo = v_1100;
272
273 __m256i v_3312 = _mm256_insert_epi64 (v_3311, v_20, 0);
274 __m256i v_3322 = _mm256_insert_epi64 (v_3312, v_21, 1);
275 hi = v_3322;
276
277 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
278 // need to make two __mmask8 from one __mmask16
279 // the intrinsics are not very helpful for working with mmask*, so...
280 union {
281 __mmask16 _wide;
282 __mmask8 _narrow[2];
283 } merged;
284 merged._wide = src;
285 lo = merged._narrow[0];
286 hi = merged._narrow[1];
287
288 #endif
289 }
290};
291
292#elif VCP_VEC_WIDTH == VCP_VEC_W__64
293
294class RealAccumVecSPDP : public RealVec<double> {
295public:
296 vcp_inline
298
299 vcp_inline
301 this->_d = rcv;
302 }
303
304 vcp_inline
305 static RealAccumVecSPDP convertCalcToAccum(const RealCalcVec & rcv) {
306 RealAccumVecSPDP result;
307 result._d = rcv;
308 return result;
309 }
310
311 vcp_inline
312 static RealCalcVec convertAccumToCalc(const RealAccumVecSPDP & rav) {
313 RealCalcVec result(rav);
314 return result;
315 }
316
317 vcp_inline
318 static RealAccumVecSPDP aligned_load_mask(const double * const a, MaskVec<float> m) {
319 return apply_mask(aligned_load(a),MaskVec<double>(m));
320 }
321};
322
323#endif /* VCP_VEC_WIDTH */
324
325} /* namespace vcp */
326
327#endif /* SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALACCUMVECSPDP_H_ */
Definition: MaskVecDouble.h:18
Definition: MaskVecFloat.h:18
Definition: RealAccumVecSPDP.h:294
Definition: RealVecDouble.h:18
Definition: RealVec.h:22