8#ifndef SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALACCUMVECSPDP_H_
9#define SRC_PARTICLECONTAINER_ADAPTER_VECTORIZATION_REALACCUMVECSPDP_H_
15#if VCP_VEC_WIDTH != VCP_VEC_W__64
// Accumulator vector made of two RealVec<double> halves. The static helpers
// of this class convert between a single RealCalcVec and such a pair
// (convert_low / convert_high / back_convert).
18class RealAccumVecSPDP {
// First double-precision half; aligned_store() writes it at the start of
// the destination.
21 RealVec<double> _first;
// Second double-precision half; aligned_store() writes it
// sizeof(RealVec<double>) / sizeof(double) doubles after the first half.
22 RealVec<double> _second;
// Builds an accumulator from a RealCalcVec: convert_low() supplies the first
// constructor argument and convert_high() the second.
// @param rcv vector to convert
// @return accumulator constructed from the two converted halves
29 static RealAccumVecSPDP convertCalcToAccum(
const RealCalcVec & rcv) {
30 RealVec<double> first = convert_low(rcv);
31 RealVec<double> second = convert_high(rcv);
32 return RealAccumVecSPDP(first, second);
// Converts an accumulator back into a RealCalcVec by handing both halves to
// back_convert().
// @param rav accumulator to convert
// NOTE(review): the statement returning `ret` is not visible in this
// listing - presumably `ret` is returned unchanged; confirm.
36 static RealCalcVec convertAccumToCalc(
const RealAccumVecSPDP & rav) {
37 RealCalcVec ret = back_convert(rav._first, rav._second);
// Copy constructor.
// @param rhs accumulator to copy from
42 RealAccumVecSPDP(
const RealAccumVecSPDP& rhs) {
// NOTE(review): only the copy of _second is visible in this listing; the
// matching copy of _first is presumably on an omitted line - confirm.
44 _second = rhs._second;
// Constructs an accumulator from two double-precision vectors.
// @param first  vector for the first half
// @param second vector for the second half
// NOTE(review): the constructor body is not visible in this listing;
// presumably it stores `first` in _first and `second` in _second - confirm.
48 RealAccumVecSPDP(
const RealVec<double>& first,
const RealVec<double>& second) {
// Copy assignment.
// @param rhs accumulator to copy from
54 RealAccumVecSPDP& operator=(
const RealAccumVecSPDP& rhs) {
// NOTE(review): only the copy of _second is visible in this listing; the
// copy of _first and the `return *this` are presumably on omitted lines -
// confirm.
56 _second = rhs._second;
// Builds an accumulator whose two halves are both RealVec<double>::zero().
61 static RealAccumVecSPDP zero() {
62 RealAccumVecSPDP result;
63 result._first = RealVec<double>::zero();
64 result._second = RealVec<double>::zero();
// Half-wise addition: each half of rhs is added to the matching half of this
// accumulator and the sums are placed in a new accumulator. Being a const
// member, it leaves *this unchanged.
// @param rhs right-hand operand
69 RealAccumVecSPDP operator+(
const RealAccumVecSPDP& rhs)
const {
70 RealAccumVecSPDP result;
71 result._first = _first + rhs._first;
72 result._second = _second + rhs._second;
// Half-wise multiplication: each half of this accumulator is multiplied by
// the matching half of rhs and the products are placed in a new accumulator.
// Being a const member, it leaves *this unchanged.
// @param rhs right-hand operand
77 RealAccumVecSPDP operator*(
const RealAccumVecSPDP& rhs)
const {
78 RealAccumVecSPDP result;
79 result._first = _first * rhs._first;
80 result._second = _second * rhs._second;
// Half-wise subtraction: each half of rhs is subtracted from the matching
// half of this accumulator and the differences are placed in a new
// accumulator. Being a const member, it leaves *this unchanged.
// @param rhs right-hand operand
85 RealAccumVecSPDP operator-(
const RealAccumVecSPDP& rhs)
const {
86 RealAccumVecSPDP result;
87 result._first = _first - rhs._first;
88 result._second = _second - rhs._second;
// Applies RealVec<double>::fmadd separately to the first halves and to the
// second halves of a, b and c.
// NOTE(review): presumably a * b + c per lane (that is how scal_prod() uses
// it) - confirm against RealVec<double>::fmadd.
93 static RealAccumVecSPDP fmadd(
const RealAccumVecSPDP & a,
const RealAccumVecSPDP& b,
const RealAccumVecSPDP& c ) {
94 RealAccumVecSPDP result;
95 result._first = RealVec<double>::fmadd(a._first, b._first, c._first);
96 result._second = RealVec<double>::fmadd(a._second, b._second, c._second);
// Applies RealVec<double>::fnmadd separately to the first halves and to the
// second halves of a, b and c.
// NOTE(review): presumably c - a * b per lane - confirm against
// RealVec<double>::fnmadd.
101 static RealAccumVecSPDP fnmadd(
const RealAccumVecSPDP & a,
const RealAccumVecSPDP& b,
const RealAccumVecSPDP& c ) {
102 RealAccumVecSPDP result;
103 result._first = RealVec<double>::fnmadd(a._first, b._first, c._first);
104 result._second = RealVec<double>::fnmadd(a._second, b._second, c._second);
// Writes both halves to consecutive memory: _first at location and _second
// `offset` doubles after it.
// @param location destination address
// NOTE(review): alignment requirements are those of
// RealVec<double>::aligned_store - confirm there.
109 void aligned_store(
double * location)
const {
// sizeof(RealVec<double>) expressed as a count of doubles.
110 const size_t offset =
sizeof(RealVec<double>) /
sizeof(
double);
111 _first.aligned_store(location);
112 _second.aligned_store(location + offset);
// Loads an accumulator from consecutive memory: the first half from a and
// the second half from a + offset. This matches the layout written by
// aligned_store().
// @param a source address
// @return accumulator constructed from the two loaded halves
116 static RealAccumVecSPDP aligned_load(
const double *
const a) {
// sizeof(RealVec<double>) expressed as a count of doubles.
117 const size_t offset =
sizeof(RealVec<double>) /
sizeof(
double);
118 RealVec<double> first = RealVec<double>::aligned_load(a);
119 RealVec<double> second = RealVec<double>::aligned_load(a + offset);
120 return RealAccumVecSPDP(first, second);
// Masked variant of aligned_load(): the float mask is split into two double
// masks, and each half is loaded with its own mask.
// @param a source address
// @param m mask defined on the float lanes
// @return accumulator constructed from the two masked loads
124 static RealAccumVecSPDP aligned_load_mask(
const double *
const a, MaskVec<float> m) {
// Split m into one double mask per half.
126 MaskVec<double> m_lo, m_hi;
127 convert_mask_vec(m, m_lo, m_hi);
// sizeof(RealVec<double>) expressed as a count of doubles.
129 const size_t offset =
sizeof(RealVec<double>) /
sizeof(
double);
131 RealVec<double> first = RealVec<double>::aligned_load_mask(a, m_lo);
132 RealVec<double> second = RealVec<double>::aligned_load_mask(a + offset, m_hi);
134 return RealAccumVecSPDP(first, second);
// Builds an accumulator whose two halves are both RealVec<double>::set1(v).
// @param v value passed to RealVec<double>::set1 for each half
// @return accumulator constructed from the two halves
138 static RealAccumVecSPDP set1(
const double& v) {
139 RealVec<double> first = RealVec<double>::set1(v);
140 RealVec<double> second = RealVec<double>::set1(v);
141 return RealAccumVecSPDP(first, second);
// Adds the two halves of a to each other and forwards the resulting
// RealVec<double> to RealVec<double>::horizontal_add_and_store.
// @param a        accumulator to reduce
// @param mem_addr destination passed on unchanged
145 static void horizontal_add_and_store(
const RealAccumVecSPDP& a,
double *
const mem_addr) {
// Combine the halves first so only one horizontal reduction is needed.
146 RealVec<double> sum = a._first + a._second;
147 RealVec<double>::horizontal_add_and_store(sum, mem_addr);
// Forwards to aligned_load_add_store of each half: _first operates on
// location, _second on location + offset.
// @param location address of the memory that is updated
// NOTE(review): presumably each call loads, adds its half and stores back -
// confirm against RealVec<double>::aligned_load_add_store.
151 void aligned_load_add_store(
double * location)
const {
// sizeof(RealVec<double>) expressed as a count of doubles.
152 const size_t offset =
sizeof(RealVec<double>) /
sizeof(
double);
153 _first.aligned_load_add_store(location);
154 _second.aligned_load_add_store(location + offset);
// Combines three component pairs as fmadd(a1, b1, fmadd(a2, b2, a3 * b3)).
// NOTE(review): presumably the three-component scalar product
// a1*b1 + a2*b2 + a3*b3 - this relies on fmadd(x, y, z) meaning x*y + z.
158 static RealAccumVecSPDP scal_prod(
159 const RealAccumVecSPDP& a1,
const RealAccumVecSPDP& a2,
const RealAccumVecSPDP& a3,
160 const RealAccumVecSPDP& b1,
const RealAccumVecSPDP& b2,
const RealAccumVecSPDP& b3) {
161 return fmadd(a1, b1, fmadd(a2, b2, a3 * b3));
165#if VCP_VEC_TYPE == VCP_VEC_KNL_GATHER or VCP_VEC_TYPE == VCP_VEC_AVX512F_GATHER
// Gathers both halves from src using the indices in lookup.
// @param src    base address of the gather
// @param offset not used by the statements visible here
//               (NOTE(review): confirm whether that is intended)
// @param lookup 512-bit index vector; its lower 256 bits index the first
//               half, its upper 256 bits the second half
// @return accumulator constructed from the two gathered halves
167 static RealAccumVecSPDP gather_load(
const double *
const src,
const size_t& offset,
const vcp_lookupOrMask_vec& lookup) {
// Split the index vector into its lower (0) and upper (1) 256-bit part.
168 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
169 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
// 32-bit indices, scaled by 8 bytes per element.
170 RealVec<double> first (_mm512_i32gather_pd(lookup_256i_lo, src, 8));
171 RealVec<double> second (_mm512_i32gather_pd(lookup_256i_hi, src, 8));
172 return RealAccumVecSPDP(first, second);
// Scatters both halves to addr using the indices in lookup.
// @param addr   base address of the scatter
// @param offset not used by the statements visible here
//               (NOTE(review): confirm whether that is intended)
// @param lookup 512-bit index vector; its lower 256 bits index _first, its
//               upper 256 bits index _second
176 void scatter_store(
double*
const addr,
const size_t& offset,
const vcp_lookupOrMask_vec& lookup) {
// Split the index vector into its lower (0) and upper (1) 256-bit part.
177 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
178 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
// 32-bit indices, scaled by 8 bytes per element.
179 _mm512_i32scatter_pd(addr, lookup_256i_lo, _first, 8);
180 _mm512_i32scatter_pd(addr, lookup_256i_hi, _second, 8);
// Masked variant of scatter_store(): the float mask is split into two double
// masks, and each half is scattered under its own mask.
// @param addr   base address of the scatter
// @param offset not used by the statements visible here
//               (NOTE(review): confirm whether that is intended)
// @param lookup 512-bit index vector; its lower 256 bits index _first, its
//               upper 256 bits index _second
// @param mask   mask defined on the float lanes
184 void scatter_store_mask(
double*
const addr,
const size_t& offset,
const vcp_lookupOrMask_vec& lookup,
const MaskVec<float>& mask) {
// Split the index vector into its lower (0) and upper (1) 256-bit part.
185 __m256i lookup_256i_lo = _mm512_extracti64x4_epi64(lookup, 0);
186 __m256i lookup_256i_hi = _mm512_extracti64x4_epi64(lookup, 1);
// Split the float mask into one double mask per half.
187 MaskVec<double> m_lo, m_hi;
188 convert_mask_vec(mask, m_lo, m_hi);
// 32-bit indices, scaled by 8 bytes per element.
189 _mm512_mask_i32scatter_pd(addr, m_lo, lookup_256i_lo, _first, 8);
190 _mm512_mask_i32scatter_pd(addr, m_hi, lookup_256i_hi, _second, 8);
// Converts the lower half of the single-precision lanes of rhs to double
// precision. One implementation per vector width.
// @param rhs single-precision vector
195 static RealVec<double> convert_low(
const RealCalcVec& rhs) {
196 #if VCP_VEC_WIDTH == VCP_VEC_W__64
// (no statement for the 64-bit width is visible in this listing)
198 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
// _mm_cvtps_pd converts the two lowest floats.
199 return _mm_cvtps_pd(rhs);
200 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
// Take the lower 128-bit part (index 0) and convert its four floats.
201 return _mm256_cvtps_pd(_mm256_extractf128_ps(rhs, 0));
202 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
// Reinterpret as doubles to extract the lower 256 bits (index 0), reinterpret
// back to floats, then convert the eight floats.
203 return _mm512_cvtps_pd(_mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(rhs), 0)));
// Converts the upper half of the single-precision lanes of rhs to double
// precision. One implementation per vector width.
// @param rhs single-precision vector
208 static RealVec<double> convert_high(
const RealCalcVec& rhs) {
209 #if VCP_VEC_WIDTH == VCP_VEC_W__64
// (no statement for the 64-bit width is visible in this listing)
211 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
// Shift the register right by 8 bytes so the two upper floats land in the
// low positions, then convert them.
212 return _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(rhs), 8)));
213 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
// Take the upper 128-bit part (index 1) and convert its four floats.
214 return _mm256_cvtps_pd(_mm256_extractf128_ps(rhs, 1));
215 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
// Reinterpret as doubles to extract the upper 256 bits (index 1), reinterpret
// back to floats, then convert the eight floats.
216 return _mm512_cvtps_pd(_mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(rhs), 1)));
// Narrows two double-precision vectors to single precision and joins them
// into one RealCalcVec, with lo in the lower half and hi in the upper half.
// @param lo doubles for the lower half of the result
// @param hi doubles for the upper half of the result
221 static RealCalcVec back_convert(
const RealVec<double>& lo,
const RealVec<double>& hi) {
222 #if VCP_VEC_WIDTH == VCP_VEC_W__64
// (no statement for the 64-bit width is visible in this listing)
224 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
// Each conversion leaves its two floats in the low half of the register;
// _mm_movelh_ps places the low half of c_hi above the low half of c_lo.
225 __m128 c_lo = _mm_cvtpd_ps(lo);
226 __m128 c_hi = _mm_cvtpd_ps(hi);
227 return _mm_movelh_ps(c_lo, c_hi);
228 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
229 __m128 c_lo = _mm256_cvtpd_ps(lo);
230 __m128 c_hi = _mm256_cvtpd_ps(hi);
// Widen c_lo to 256 bits, then overwrite the upper 128-bit part with c_hi.
232 __m256 ret = _mm256_castps128_ps256(c_lo);
233 ret = _mm256_insertf128_ps(ret, c_hi, 1);
// NOTE(review): the return of `ret` is not visible in this listing - confirm.
236 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
237 __m256 c_lo = _mm512_cvtpd_ps(lo);
238 __m256 c_hi = _mm512_cvtpd_ps(hi);
// Widen c_lo to 512 bits, then overwrite the upper 256-bit part with c_hi.
240 __m512 ret = _mm512_castps256_ps512(c_lo);
241 ret = _mm512_insertf32x8(ret, c_hi, 1);
// NOTE(review): the return of `ret` is not visible in this listing - confirm.
// Splits a mask defined on float lanes into two masks on double lanes: lo
// for the lower half of the float lanes and hi for the upper half.
// @param src mask on the float lanes
// @param lo  receives the mask for the lower half
// @param hi  receives the mask for the upper half
248 static void convert_mask_vec(
const MaskVec<float>& src, MaskVec<double>& lo, MaskVec<double>& hi) {
249 #if VCP_VEC_WIDTH == VCP_VEC_W__64
// (no statement for the 64-bit width is visible in this listing)
251 #elif VCP_VEC_WIDTH == VCP_VEC_W_128
// Interleaving src with itself duplicates every 32-bit entry, so each entry
// fills one 64-bit lane.
253 lo = _mm_unpacklo_epi32(src, src);
254 hi = _mm_unpackhi_epi32(src, src);
256 #elif VCP_VEC_WIDTH == VCP_VEC_W_256
// Variable names list 64-bit pairs of float entries from the highest to the
// lowest position; the digit is the index of the source pair.
257 __m256 v_3210 = _mm256_castsi256_ps(src);
// unpacklo/unpackhi work per 128-bit lane, so the duplicated entries come
// out interleaved across the two results and must be regrouped below.
259 __m256i v_2200 = _mm256_castps_si256(_mm256_unpacklo_ps(v_3210, v_3210));
260 __m256i v_3311 = _mm256_castps_si256(_mm256_unpackhi_ps(v_3210, v_3210));
// Duplicated entries of pair 1, taken from the lower lane of v_3311.
263 auto v_10 = _mm256_extract_epi64 (v_3311, 0);
264 auto v_11 = _mm256_extract_epi64 (v_3311, 1);
// Duplicated entries of pair 2, taken from the upper lane of v_2200.
266 auto v_20 = _mm256_extract_epi64 (v_2200, 2);
267 auto v_21 = _mm256_extract_epi64 (v_2200, 3);
// Replace the upper lane of v_2200 with pair 1: entries of pairs 0 and 1.
269 __m256i v_2100 = _mm256_insert_epi64 (v_2200, v_10, 2);
270 __m256i v_1100 = _mm256_insert_epi64 (v_2100, v_11, 3);
// Replace the lower lane of v_3311 with pair 2: entries of pairs 2 and 3.
273 __m256i v_3312 = _mm256_insert_epi64 (v_3311, v_20, 0);
274 __m256i v_3322 = _mm256_insert_epi64 (v_3312, v_21, 1);
// NOTE(review): the assignments to lo and hi are not visible in this listing;
// presumably lo = v_1100 and hi = v_3322 - confirm.
277 #elif VCP_VEC_WIDTH == VCP_VEC_W_512
// NOTE(review): `merged` is declared and filled on lines not visible in this
// listing; presumably it overlays src with two narrower masks - confirm.
285 lo = merged._narrow[0];
286 hi = merged._narrow[1];
292#elif VCP_VEC_WIDTH == VCP_VEC_W__64
Definition: MaskVecDouble.h:18
Definition: MaskVecFloat.h:18
Definition: RealAccumVecSPDP.h:294
Definition: RealVecDouble.h:18