8#ifndef SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALVECDOUBLE_H_
9#define SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALVECDOUBLE_H_
22 #if VCP_VEC_WIDTH == VCP_VEC_W__64
23 typedef double real_vec;
24 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
25 typedef __m128d real_vec;
26 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
27 typedef __m256d real_vec;
28 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
29 typedef __m512d real_vec;
40 operator real_vec()
const {
72 #if VCP_VEC_WIDTH == VCP_VEC_W__64
73 return static_cast<real_vec
>(m);
74 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
75 return _mm_castsi128_pd(m);
76 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
77 return _mm256_castsi256_pd(m);
78 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
79 return set1(0.0 / 0.0);
85 #if VCP_VEC_WIDTH == VCP_VEC_W__64
87 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
88 return _mm_castpd_si128(d);
89 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
90 return _mm256_castpd_si256(d);
91 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
98 #if VCP_VEC_WIDTH == VCP_VEC_W__64
100 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
101 return _mm_setzero_pd();
102 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
103 return _mm256_setzero_pd();
104 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
105 return _mm512_setzero_pd();
111#if VCP_VEC_WIDTH != VCP_VEC_W_512
114 return _mm512_castsi512_pd( _mm512_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0) );
120 #if VCP_VEC_WIDTH == VCP_VEC_W__64
122 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
123 return _mm_add_pd(_d, rhs);
124 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
125 return _mm256_add_pd(_d, rhs);
126 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
127 return _mm512_add_pd(_d, rhs);
133 #if VCP_VEC_WIDTH == VCP_VEC_W__64
135 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
136 return _mm_sub_pd(_d, rhs);
137 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
138 return _mm256_sub_pd(_d, rhs);
139 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
140 return _mm512_sub_pd(_d, rhs);
146 #if VCP_VEC_WIDTH == VCP_VEC_W__64
148 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
149 return _mm_mul_pd(_d, rhs);
150 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
151 return _mm256_mul_pd(_d, rhs);
152 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
153 return _mm512_mul_pd(_d, rhs);
159 #if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
161 #elif VCP_VEC_TYPE == VCP_VEC_AVX2
162 return _mm256_fmadd_pd(a, b, c);
164 return _mm512_fmadd_pd(a, b, c);
170 #if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
172 #elif VCP_VEC_TYPE == VCP_VEC_AVX2
173 return _mm256_fnmadd_pd(a, b, c);
175 return _mm512_fnmadd_pd(a, b, c);
181 #if VCP_VEC_TYPE == VCP_NOVEC or VCP_VEC_TYPE == VCP_VEC_SSE3 or VCP_VEC_TYPE == VCP_VEC_AVX
183 #elif VCP_VEC_TYPE == VCP_VEC_AVX2
184 return _mm256_fmsub_pd(a, b, c);
186 return _mm512_fmsub_pd(a, b, c);
192 #if VCP_VEC_WIDTH == VCP_VEC_W__64
194 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
195 return _mm_div_pd(_d, rhs);
196 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
197 return _mm256_div_pd(_d, rhs);
198 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
199 return _mm512_div_pd(_d, rhs);
205 #if VCP_VEC_WIDTH == VCP_VEC_W__64
206 return std::sqrt(rhs);
207 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
208 return _mm_sqrt_pd(rhs);
209 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
210 return _mm256_sqrt_pd(rhs);
211 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
212 return _mm512_sqrt_pd(rhs);
220 return fmadd(a1, b1, fmadd(a2, b2, a3 * b3));
224 static RealVec set1(
const double& v) {
225 #if VCP_VEC_WIDTH == VCP_VEC_W__64
227 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
228 return _mm_set1_pd(v);
229 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
230 return _mm256_set1_pd(v);
231 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
232 return _mm512_set1_pd(v);
237 static RealVec aligned_load(
const double *
const a) {
238 #if VCP_VEC_WIDTH == VCP_VEC_W__64
240 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
241 return _mm_load_pd(a);
242 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
243 return _mm256_load_pd(a);
244 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
245 return _mm512_load_pd(a);
250 static RealVec broadcast(
const double *
const a) {
251 #if VCP_VEC_WIDTH == VCP_VEC_W__64
253 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
254 return _mm_loaddup_pd(a);
255 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
256 return _mm256_broadcast_sd(a);
257 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
258 return _mm512_set1_pd(*a);
263 void aligned_store(
double * location)
const {
264 #if VCP_VEC_WIDTH == VCP_VEC_W__64
266 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
267 _mm_store_pd(location, _d);
268 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
269 _mm256_store_pd(location, _d);
270 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
271 _mm512_store_pd(location, _d);
277 #if VCP_VEC_WIDTH == VCP_VEC_W__64
279 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
280 return _mm_unpacklo_pd(a,b);
281 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
282 return _mm256_unpacklo_pd(a,b);
283 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
284 return set1(0.0 / 0.0);
290 #if VCP_VEC_WIDTH == VCP_VEC_W__64
292 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
293 return _mm_unpackhi_pd(a,b);
294 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
295 return _mm256_unpackhi_pd(a, b);
296 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
297 return set1(0.0 / 0.0);
303 #if VCP_VEC_WIDTH == VCP_VEC_W__64
305 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
306 return cast_RealCalcVec_to_MaskVec(_mm_cmplt_pd(_d, rhs));
307 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
308 return cast_RealCalcVec_to_MaskVec(_mm256_cmp_pd(_d, rhs, _CMP_LT_OS));
309 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
310 return _mm512_cmp_pd_mask(_d, rhs, _CMP_LT_OS);
316 #if VCP_VEC_WIDTH == VCP_VEC_W__64
318 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
319 return cast_RealCalcVec_to_MaskVec(_mm_cmpneq_pd(_d, rhs));
320 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
321 return cast_RealCalcVec_to_MaskVec(_mm256_cmp_pd(_d, rhs, _CMP_NEQ_OS));
322 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
323 return _mm512_cmp_pd_mask(_d, rhs, _CMP_NEQ_UQ);
329 #if VCP_VEC_TYPE == VCP_NOVEC
330 return m ? d : RealVec::zero();
331 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
332 return _mm512_mask_mov_pd(RealVec::zero(), m, d);
334 return cast_MaskVec_to_RealCalcVec(cast_RealCalcVec_to_MaskVec(d) and m);
340 #if VCP_VEC_WIDTH == VCP_VEC_W__64
341 return apply_mask(RealVec::aligned_load(a),m);
342 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
343 return apply_mask(RealVec::aligned_load(a),m);
344 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
345 return _mm256_maskload_pd(a, m);
346 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
347 return _mm512_mask_load_pd(RealVec::zero(), m, a);
352 static void horizontal_add_and_store(
const RealVec& a,
double *
const mem_addr) {
353 #if VCP_VEC_WIDTH == VCP_VEC_W__64
357 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
359 const RealVec t1 = _mm_hadd_pd(a, a);
360 const RealVec t2 = _mm_add_sd( t1, _mm_load_sd(mem_addr));
361 _mm_store_sd( mem_addr, t2);
363 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
365 static const __m256i memoryMask_first = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 1<<31, 0);
366 const RealVec a_t1 = _mm256_permute2f128_pd(a, a, 0x1);
367 const RealVec a_t2 = _mm256_hadd_pd(a, a_t1);
368 const RealVec a_t3 = _mm256_hadd_pd(a_t2, a_t2);
372 a_t3 + RealVec::aligned_load_mask(mem_addr, memoryMask_first)
375 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
377 __m256d low = _mm512_extractf64x4_pd(a, 0);
378 __m256d high = _mm512_extractf64x4_pd(a, 1);
380 __m256d t1 = _mm256_hadd_pd(low + high, low + high);
381 __m128d t2 = _mm256_extractf128_pd(t1,1);
382 __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
384 *mem_addr += _mm_cvtsd_f64(t3);
389 void aligned_load_add_store(
double * location)
const {
390 RealVec dest = aligned_load(location);
392 dest.aligned_store(location);
415 #if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
416 const __m128 denom_ps = _mm256_cvtpd_ps(d);
417 const __m128 recip_ps = _mm_rcp_ps(denom_ps);
419 const real_vec inv_unmasked = _mm256_cvtps_pd(recip_ps);
420 #elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL
421 const RealVec inv = _mm512_maskz_rcp28_pd(m, d);
422 #elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
423 const RealVec inv = _mm512_maskz_rcp14_pd(m, d);
425 const RealVec inv_unmasked = set1(1.0) / d;
429 #if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
430 const RealVec inv = apply_mask(inv_unmasked, m);
432 const RealVec inv_24bits = inv * fnmadd(d, inv, set1(2.0));
433 const RealVec inv_48bits = inv_24bits * fnmadd(d, inv_24bits, set1(2.0));
434 const RealVec inv_prec = inv_48bits * fnmadd(d, inv_48bits, set1(2.0));
435 #elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL
436 const RealVec inv_prec = inv * fnmadd(d, inv, set1(2.0));
437 #elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
438 const RealVec inv_28bits = inv * fnmadd(d, inv, set1(2.0));
439 const RealVec inv_prec = inv_28bits * fnmadd(d, inv_28bits, set1(2.0));
441 const RealVec inv_prec = apply_mask(inv_unmasked, m);
447 #if VCP_VEC_WIDTH == VCP_VEC_W__64
448 int nan_found = std::isnan(inv_prec) ? 1 : 0;
449 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
450 int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(inv_prec, inv_prec));
451 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
452 __m128d low = _mm256_castpd256_pd128(inv_prec);
453 __m128d high = _mm256_extractf128_pd(inv_prec, 1);
454 int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(low, high));
455 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
456 MaskVec<double> nan_found = _mm512_cmp_pd_mask(inv_prec, inv_prec, _CMP_UNORD_Q);
459 Log::global_log->error() <<
"NaN detected! Perhaps forceMask is wrong" << std::endl;
460 mardyn_assert(
false);
489 #if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
490 const __m128 denom_ps = _mm256_cvtpd_ps(d);
491 const __m128 recipSqrt_ps = _mm_rsqrt_ps(denom_ps);
493 const real_vec invSqrt_unmasked = _mm256_cvtps_pd(recipSqrt_ps);
494 #elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL
495 const RealVec invSqrt = _mm512_maskz_rsqrt28_pd(m, d);
496 #elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
497 const RealVec invSqrt = _mm512_maskz_rsqrt14_pd(m, d);
499 const RealVec invSqrt_unmasked = set1(1.0) / sqrt(d);
503 #if VCP_VEC_WIDTH == VCP_VEC_W_256 and VCP_VEC_TYPE == VCP_VEC_AVX2
504 const RealVec invSqrt = apply_mask(invSqrt_unmasked, m);
506 const RealVec d2 = d * set1(0.5);
507 const RealVec invSqrt_24bits = invSqrt * fnmadd(d2, invSqrt * invSqrt, set1(1.5));
508 const RealVec invSqrt_48bits = invSqrt_24bits * fnmadd(d2, invSqrt_24bits * invSqrt_24bits, set1(1.5));
509 const RealVec invSqrt_prec = invSqrt_48bits * fnmadd(d2, invSqrt_48bits * invSqrt_48bits, set1(1.5));
510 #elif VCP_VEC_TYPE == VCP_VEC_KNL or VCP_VEC_TYPE == VCP_VEC_KNL
511 const RealVec invSqrt_prec = invSqrt * fnmadd(d * set1(0.5), invSqrt * invSqrt, set1(1.5));
512 #elif VCP_VEC_TYPE == VCP_VEC_AVX512F or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
513 const RealVec d2 = d * set1(0.5);
514 const RealVec invSqrt_28bits = invSqrt *fnmadd(d2, invSqrt * invSqrt, set1(1.5));
515 const RealVec invSqrt_prec = invSqrt_28bits *fnmadd(d2, invSqrt_28bits * invSqrt_28bits, set1(1.5));
517 const RealVec invSqrt_prec = apply_mask(invSqrt_unmasked, m);
523 #if VCP_VEC_WIDTH == VCP_VEC_W__64
524 int nan_found = std::isnan(invSqrt_prec) ? 1 : 0;
525 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
526 int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(invSqrt_prec, invSqrt_prec));
527 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
528 __m128d low = _mm256_castpd256_pd128(invSqrt_prec);
529 __m128d high = _mm256_extractf128_pd(invSqrt_prec, 1);
530 int nan_found = _mm_movemask_pd(_mm_cmpunord_pd(low, high));
531 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
532 MaskVec<double> nan_found = _mm512_cmp_pd_mask(invSqrt_prec, invSqrt_prec, _CMP_UNORD_Q);
535 Log::global_log->error() <<
"NaN detected! Perhaps forceMask is wrong" << std::endl;
536 mardyn_assert(
false);
Definition: MaskVecDouble.h:18
static vcp_inline RealVec fastReciprocSqrt_mask(const RealVec &d, const MaskVec< double > &m)
Calculates 1/sqrt(d) and applies mask m on the result \detail This could also be done by directly usi...
Definition: RealVecDouble.h:486
static vcp_inline RealVec fastReciprocal_mask(const RealVec &d, const MaskVec< double > &m)
Calculates 1/d and applies mask m on the result \detail This could also be done by directly using ope...
Definition: RealVecDouble.h:412