/*
 * RealVecDouble.h
 *
 *  Created on: 6 Feb 2018
 *      Author: tchipevn
 */

#ifndef SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALVECDOUBLE_H_
#define SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALVECDOUBLE_H_

#include "RealVec.h"

// keep this file and RealVecFloat as close as possible, so that they can be examined via diff!
namespace vcp {

template<>
class RealVec<double> {
private:

	// own typedefs necessary
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		typedef double real_vec;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		typedef __m128d real_vec;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		typedef __m256d real_vec;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		typedef __m512d real_vec;
	#endif

protected:
	real_vec _d;

public:
	vcp_inline
	RealVec() {}

	vcp_inline
	operator real_vec() const {
		return _d;
	}

	vcp_inline
	RealVec(const real_vec & d) {
		_d = d;
	}

	vcp_inline
	RealVec(const RealVec& rhs) {
		_d = rhs._d;
	}

	vcp_inline
	RealVec& operator=(const RealVec& rhs) {
		_d = rhs._d;
		return *this;
	}

	vcp_inline
	static RealVec convertCalcToAccum(const RealVec & rcv) {
		return rcv;
	}

	vcp_inline
	static RealVec convertAccumToCalc(const RealVec & rav) {
		return rav;
	}

	vcp_inline
	static RealVec cast_MaskVec_to_RealCalcVec(const MaskVec<double>& m) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return static_cast<real_vec>(m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_castsi128_pd(m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_castsi256_pd(m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return set1(0.0 / 0.0); // do not use
	#endif
	}

	vcp_inline
	static MaskVec<double> cast_RealCalcVec_to_MaskVec(const RealVec& d) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return static_cast<MaskVec<double>>(d);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_castpd_si128(d);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_castpd_si256(d);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return false; // do not use
	#endif
	}

	vcp_inline
	static RealVec zero() {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return 0.0;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_setzero_pd();
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_setzero_pd();
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_setzero_pd();
	#endif
	}

	vcp_inline
	static RealVec ones() {
#if VCP_VEC_WIDTH != VCP_VEC_W_512
		return cast_MaskVec_to_RealCalcVec(MaskVec<double>::ones());
#else
		return _mm512_castsi512_pd( _mm512_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0) );
#endif
	}

	vcp_inline
	RealVec operator + (const RealVec& rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d + rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_add_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_add_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_add_pd(_d, rhs);
	#endif
	}

	vcp_inline
	RealVec operator - (const RealVec& rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d - rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_sub_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_sub_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_sub_pd(_d, rhs);
	#endif
	}

	vcp_inline
	RealVec operator * (const RealVec& rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d * rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_mul_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_mul_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_mul_pd(_d, rhs);
	#endif
	}

	vcp_inline
	static RealVec fmadd(const RealVec & a, const RealVec& b, const RealVec& c ) {
	#if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
		return (a * b) + c;
	#elif VCP_VEC_TYPE == VCP_VEC_AVX2
		return _mm256_fmadd_pd(a, b, c);
	#else /* KNL */
		return _mm512_fmadd_pd(a, b, c);
	#endif
	}

	vcp_inline
	static RealVec fnmadd(const RealVec & a, const RealVec& b, const RealVec& c ) {
	#if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
		return c - (a * b);
	#elif VCP_VEC_TYPE == VCP_VEC_AVX2
		return _mm256_fnmadd_pd(a, b, c);
	#else /* KNL */
		return _mm512_fnmadd_pd(a, b, c);
	#endif
	}

	vcp_inline
	static RealVec fmsub(const RealVec & a, const RealVec& b, const RealVec& c) {
	#if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
		return (a * b) - c;
	#elif VCP_VEC_TYPE == VCP_VEC_AVX2
		return _mm256_fmsub_pd(a, b, c);
	#else /* KNL */
		return _mm512_fmsub_pd(a, b, c);
	#endif
	}
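
	/**
	 * Editor's usage sketch (not part of the original file): the three fused
	 * operations above reduce to the scalar identities
	 * fmadd(a,b,c) = a*b + c, fnmadd(a,b,c) = c - a*b, fmsub(a,b,c) = a*b - c.
	 * A typical use is Horner evaluation of a polynomial; the coefficient
	 * names c0, c1, c2 and the argument x are hypothetical.
	 * \code
	 * // p(x) = c2*x^2 + c1*x + c0, two fused multiply-adds per lane
	 * RealVec<double> p = RealVec<double>::fmadd(
	 *         RealVec<double>::fmadd(c2, x, c1), x, c0);
	 * \endcode
	 */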

	vcp_inline
	RealVec operator / (const RealVec& rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d / rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_div_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_div_pd(_d, rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_div_pd(_d, rhs);
	#endif
	}

	vcp_inline
	static RealVec sqrt (const RealVec& rhs) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return std::sqrt(rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_sqrt_pd(rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_sqrt_pd(rhs);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_sqrt_pd(rhs);
	#endif
	}

	vcp_inline
	static RealVec scal_prod(
			const RealVec& a1, const RealVec& a2, const RealVec& a3,
			const RealVec& b1, const RealVec& b2, const RealVec& b3) {
		return fmadd(a1, b1, fmadd(a2, b2, a3 * b3));
	}
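
	/**
	 * Editor's usage sketch (not part of the original file): scal_prod is the
	 * lane-wise 3D dot product, e.g. for squared particle distances. The
	 * difference vectors dx, dy, dz are hypothetical names.
	 * \code
	 * // r2 = dx*dx + dy*dy + dz*dz, one lane per particle pair
	 * RealVec<double> r2 = RealVec<double>::scal_prod(dx, dy, dz, dx, dy, dz);
	 * \endcode
	 */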

	vcp_inline
	static RealVec set1(const double& v) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return v;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_set1_pd(v);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_set1_pd(v);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_set1_pd(v);
	#endif
	}

	vcp_inline
	static RealVec aligned_load(const double * const a) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return *a;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_load_pd(a);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_load_pd(a);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_load_pd(a);
	#endif
	}

	vcp_inline
	static RealVec broadcast(const double * const a) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return *a;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_loaddup_pd(a);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_broadcast_sd(a);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_set1_pd(*a);
	#endif
	}

	vcp_inline
	void aligned_store(double * location) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		*location = _d;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		_mm_store_pd(location, _d);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		_mm256_store_pd(location, _d);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		_mm512_store_pd(location, _d);
	#endif
	}
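
	/**
	 * Editor's usage sketch (not part of the original file): a load/compute/store
	 * round trip. On the SIMD paths, aligned_load and aligned_store require the
	 * pointer to be aligned to the vector width; the buffer buf is hypothetical.
	 * \code
	 * // scale one properly aligned buffer chunk by 2.0
	 * RealVec<double> v = RealVec<double>::aligned_load(buf);
	 * v = v * RealVec<double>::set1(2.0);
	 * v.aligned_store(buf);
	 * \endcode
	 */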

	vcp_inline
	static RealVec unpack_lo(const RealVec& a, const RealVec& b) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return 0.0 / 0.0; // makes no sense
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_unpacklo_pd(a, b);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_unpacklo_pd(a, b);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return set1(0.0 / 0.0); // not used!
	#endif
	}

	vcp_inline
	static RealVec unpack_hi(const RealVec& a, const RealVec& b) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return 0.0 / 0.0; // makes no sense
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return _mm_unpackhi_pd(a, b);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_unpackhi_pd(a, b);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return set1(0.0 / 0.0); // not used!
	#endif
	}

	vcp_inline
	MaskVec<double> operator < (const RealVec & rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d < rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return cast_RealCalcVec_to_MaskVec(_mm_cmplt_pd(_d, rhs));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return cast_RealCalcVec_to_MaskVec(_mm256_cmp_pd(_d, rhs, _CMP_LT_OS));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_cmp_pd_mask(_d, rhs, _CMP_LT_OS);
	#endif
	}
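
	/**
	 * Editor's usage sketch (not part of the original file): comparisons yield
	 * a MaskVec, e.g. for the cutoff test of a force kernel. The names r2 and
	 * cutoff2 are hypothetical.
	 * \code
	 * // lane i is set iff r2[i] < cutoff2[i]
	 * MaskVec<double> insideCutoff = r2 < cutoff2;
	 * \endcode
	 */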

	vcp_inline
	MaskVec<double> operator != (const RealVec & rhs) const {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return _d != rhs;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return cast_RealCalcVec_to_MaskVec(_mm_cmpneq_pd(_d, rhs));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return cast_RealCalcVec_to_MaskVec(_mm256_cmp_pd(_d, rhs, _CMP_NEQ_OS));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_cmp_pd_mask(_d, rhs, _CMP_NEQ_UQ);
	#endif
	}

	vcp_inline
	static RealVec apply_mask(const RealVec& d, const MaskVec<double>& m) {
	#if VCP_VEC_TYPE == VCP_NOVEC
		return m ? d : RealVec::zero();
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_mask_mov_pd(RealVec::zero(), m, d);
	#else // SSE, AVX, AVX2
		return cast_MaskVec_to_RealCalcVec(cast_RealCalcVec_to_MaskVec(d) and m);
	#endif
	}
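
	/**
	 * Editor's usage sketch (not part of the original file): apply_mask zeroes
	 * the lanes whose mask bit is unset, continuing the hypothetical cutoff
	 * example above.
	 * \code
	 * // lanes outside the cutoff contribute 0.0
	 * RealVec<double> maskedForce = RealVec<double>::apply_mask(force, insideCutoff);
	 * \endcode
	 */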

	vcp_inline
	static RealVec aligned_load_mask(const double * const a, MaskVec<double> m) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		return apply_mask(RealVec::aligned_load(a), m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		return apply_mask(RealVec::aligned_load(a), m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		return _mm256_maskload_pd(a, m);
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		return _mm512_mask_load_pd(RealVec::zero(), m, a);
	#endif
	}

	vcp_inline
	static void horizontal_add_and_store(const RealVec& a, double * const mem_addr) {
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		// add one double
		(*mem_addr) += a;

	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		// add two doubles
		const RealVec t1 = _mm_hadd_pd(a, a);
		const RealVec t2 = _mm_add_sd(t1, _mm_load_sd(mem_addr));
		_mm_store_sd(mem_addr, t2);

	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		// add four doubles
		static const __m256i memoryMask_first = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 1<<31, 0);
		const RealVec a_t1 = _mm256_permute2f128_pd(a, a, 0x1);
		const RealVec a_t2 = _mm256_hadd_pd(a, a_t1);
		const RealVec a_t3 = _mm256_hadd_pd(a_t2, a_t2);
		_mm256_maskstore_pd(
			mem_addr,
			memoryMask_first,
			a_t3 + RealVec::aligned_load_mask(mem_addr, memoryMask_first)
		);

	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		// add eight doubles
		__m256d low = _mm512_extractf64x4_pd(a, 0);
		__m256d high = _mm512_extractf64x4_pd(a, 1);

		__m256d t1 = _mm256_hadd_pd(low + high, low + high);
		__m128d t2 = _mm256_extractf128_pd(t1, 1);
		__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1), t2);

		*mem_addr += _mm_cvtsd_f64(t3);
	#endif
	}
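
	/**
	 * Editor's usage sketch (not part of the original file): horizontal
	 * reduction of per-lane partial sums into a scalar accumulator, e.g. for
	 * the potential energy. The names upot_sum and upot are hypothetical.
	 * \code
	 * // *upot += sum over all lanes of upot_sum
	 * RealVec<double>::horizontal_add_and_store(upot_sum, upot);
	 * \endcode
	 */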

	vcp_inline
	void aligned_load_add_store(double * location) const {
		RealVec dest = aligned_load(location);
		dest = dest + *this;
		dest.aligned_store(location);
	}

	/**
	 * \brief Calculates 1/d and applies mask m on the result.
	 */
	// Newton-Raphson division:
	// take the _mm*_rcp version with the highest bit-precision
	// and iterate until the number of bits is >53 (DPDP) or >22 (SPSP, SPDP)
	//
	// An iteration of Newton-Raphson looks like this:
	// approximation * fnmadd(original_value, approximation, set1(2.0))
	//
	// AVX2 - some speed-up for single precision; for double precision a slow-down, presumably because there are no dedicated fast intrinsics for packed doubles
	// AVX - slow-down: fma is not available; even more pressure on the multiply-add imbalance? Leaving it out
	// KNL - an educated guess would assume that AVX512ER is there for a reason :)
	vcp_inline
	static RealVec fastReciprocal_mask(const RealVec& d, const MaskVec<double>& m) {

		/* reciprocal */
	#if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
		const __m128 denom_ps = _mm256_cvtpd_ps(d);
		const __m128 recip_ps = _mm_rcp_ps(denom_ps);

		const real_vec inv_unmasked = _mm256_cvtps_pd(recip_ps); // 12 bits
	#elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL_GATHER
		const RealVec inv = _mm512_maskz_rcp28_pd(m, d); // 28 bits
	#elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
		const RealVec inv = _mm512_maskz_rcp14_pd(m, d); // 14 bits
	#else /* VCP_VEC_WIDTH == 64/128/256AVX */
		const RealVec inv_unmasked = set1(1.0) / d;
	#endif

		/* mask and/or N-R-iterations if necessary */
	#if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
		const RealVec inv = apply_mask(inv_unmasked, m); // 12 bits

		const RealVec inv_24bits = inv * fnmadd(d, inv, set1(2.0)); // 24 bits, 1st N-R iteration
		const RealVec inv_48bits = inv_24bits * fnmadd(d, inv_24bits, set1(2.0)); // 48 bits, 2nd N-R iteration
		const RealVec inv_prec = inv_48bits * fnmadd(d, inv_48bits, set1(2.0)); // 96 bits, 3rd N-R iteration
	#elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL_GATHER
		const RealVec inv_prec = inv * fnmadd(d, inv, set1(2.0)); // 56 bits, 1 N-R iteration
	#elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
		const RealVec inv_28bits = inv * fnmadd(d, inv, set1(2.0)); // 28 bits, 1st N-R iteration
		const RealVec inv_prec = inv_28bits * fnmadd(d, inv_28bits, set1(2.0)); // 56 bits, 2nd N-R iteration
	#else /* VCP_VEC_WIDTH == 64/128/256AVX */
		const RealVec inv_prec = apply_mask(inv_unmasked, m);
	#endif

		/* check for NaNs */
	#ifndef NDEBUG
		// set nan_found to zero only if NO NaN is found
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		int nan_found = std::isnan(inv_prec) ? 1 : 0;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(inv_prec, inv_prec));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		__m128d low = _mm256_castpd256_pd128(inv_prec);
		__m128d high = _mm256_extractf128_pd(inv_prec, 1);
		int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(low, high));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		MaskVec<double> nan_found = _mm512_cmp_pd_mask(inv_prec, inv_prec, _CMP_UNORD_Q);
	#endif
		if (nan_found) {
			Log::global_log->error() << "NaN detected! Perhaps forceMask is wrong" << std::endl;
			mardyn_assert(false);
		}
	#endif

		return inv_prec;
	} // fastReciprocal_mask(..)
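
	/**
	 * Editor's sketch (not part of the original file): a scalar model of the
	 * Newton-Raphson refinement above, showing that each iteration roughly
	 * doubles the number of correct bits.
	 * \code
	 * double d = 8.0;
	 * double approx = 0.1;                      // crude seed for 1/8 = 0.125
	 * for (int i = 0; i < 5; ++i) {
	 *     approx = approx * (2.0 - d * approx); // == approx * fnmadd(d, approx, 2.0)
	 * }
	 * // relative error shrinks quadratically: 2e-1, 4e-2, 1.6e-3, 2.6e-6, ...
	 * \endcode
	 */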

	/**
	 * \brief Calculates 1/sqrt(d) and applies mask m on the result.
	 */
	// Newton-Raphson iteration for the reciprocal square root:
	// take the _mm*_rsqrt version with the highest bit-precision
	// and iterate until the number of bits is >53 (DPDP) or >22 (SPSP, SPDP)
	//
	// An iteration of Newton-Raphson looks like this:
	// y(n+1) = y(n) * (1.5 - (d / 2) * y(n)²) or
	// approximation * fnmadd(original_value * 0.5, approximation * approximation, set1(1.5))
	//
	// AVX2 - some speed-up for single precision; for double precision a slow-down, presumably because there are no dedicated fast intrinsics for packed doubles
	// AVX - not supported
	// KNL - an educated guess would assume that AVX512ER is there for a reason :)
	vcp_inline
	static RealVec fastReciprocSqrt_mask(const RealVec& d, const MaskVec<double>& m) {

		/* reciprocal sqrt */
	#if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
		const __m128 denom_ps = _mm256_cvtpd_ps(d);
		const __m128 recipSqrt_ps = _mm_rsqrt_ps(denom_ps);

		const real_vec invSqrt_unmasked = _mm256_cvtps_pd(recipSqrt_ps); // 12 bits
	#elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL_GATHER
		const RealVec invSqrt = _mm512_maskz_rsqrt28_pd(m, d); // 28 bits
	#elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
		const RealVec invSqrt = _mm512_maskz_rsqrt14_pd(m, d); // 14 bits
	#else /* VCP_VEC_WIDTH == 64/128/256AVX */
		const RealVec invSqrt_unmasked = set1(1.0) / sqrt(d);
	#endif

		/* mask and/or N-R-iterations if necessary */
	#if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
		const RealVec invSqrt = apply_mask(invSqrt_unmasked, m); // 12 bits

		const RealVec d2 = d * set1(0.5);
		const RealVec invSqrt_24bits = invSqrt * fnmadd(d2, invSqrt * invSqrt, set1(1.5)); // 24 bits, 1st N-R iteration
		const RealVec invSqrt_48bits = invSqrt_24bits * fnmadd(d2, invSqrt_24bits * invSqrt_24bits, set1(1.5)); // 48 bits, 2nd N-R iteration
		const RealVec invSqrt_prec = invSqrt_48bits * fnmadd(d2, invSqrt_48bits * invSqrt_48bits, set1(1.5)); // 96 bits, 3rd N-R iteration
	#elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL_GATHER
		const RealVec invSqrt_prec = invSqrt * fnmadd(d * set1(0.5), invSqrt * invSqrt, set1(1.5)); // 56 bits, 1 N-R iteration
	#elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
		const RealVec d2 = d * set1(0.5);
		const RealVec invSqrt_28bits = invSqrt * fnmadd(d2, invSqrt * invSqrt, set1(1.5)); // 28 bits, 1st N-R iteration
		const RealVec invSqrt_prec = invSqrt_28bits * fnmadd(d2, invSqrt_28bits * invSqrt_28bits, set1(1.5)); // 56 bits, 2nd N-R iteration
	#else /* VCP_VEC_WIDTH == 64/128/256AVX */
		const RealVec invSqrt_prec = apply_mask(invSqrt_unmasked, m);
	#endif

		/* check for NaNs */
	#ifndef NDEBUG
		// set nan_found to zero only if NO NaN is found
	#if VCP_VEC_WIDTH == VCP_VEC_W__64
		int nan_found = std::isnan(invSqrt_prec) ? 1 : 0;
	#elif VCP_VEC_WIDTH == VCP_VEC_W_128
		int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(invSqrt_prec, invSqrt_prec));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_256
		__m128d low = _mm256_castpd256_pd128(invSqrt_prec);
		__m128d high = _mm256_extractf128_pd(invSqrt_prec, 1);
		int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(low, high));
	#elif VCP_VEC_WIDTH == VCP_VEC_W_512
		MaskVec<double> nan_found = _mm512_cmp_pd_mask(invSqrt_prec, invSqrt_prec, _CMP_UNORD_Q);
	#endif
		if (nan_found) {
			Log::global_log->error() << "NaN detected! Perhaps forceMask is wrong" << std::endl;
			mardyn_assert(false);
		}
	#endif

		return invSqrt_prec;
	} // fastReciprocSqrt_mask(..)
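
	/**
	 * Editor's sketch (not part of the original file): a scalar model of the
	 * reciprocal-square-root iteration y(n+1) = y(n) * (1.5 - 0.5*d*y(n)²).
	 * \code
	 * double d = 4.0;
	 * double y = 0.4;                       // crude seed for 1/sqrt(4) = 0.5
	 * for (int i = 0; i < 5; ++i) {
	 *     y = y * (1.5 - 0.5 * d * y * y);  // == y * fnmadd(d*0.5, y*y, 1.5)
	 * }
	 * // converges quadratically to 0.5; about five iterations reach full double precision
	 * \endcode
	 */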

}; /* class RealCalcVec */

} /* namespace vcp */

#endif /* SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALVECDOUBLE_H_ */