12#ifndef SIMD_DEFINITIONS_H
13#define SIMD_DEFINITIONS_H
16#define vcp_inline inline __attribute__((always_inline))
18#define vcp_inline inline
32#if VCP_PREC == VCP_SPDP
33#include "RealAccumVecSPDP.h"
45constexpr
size_t VCP_VEC_SIZE =
sizeof(
vcp::RealCalcVec) /
sizeof(vcp_real_calc);
46constexpr
size_t VCP_VEC_SIZE_M1 = VCP_VEC_SIZE - 1u;
48constexpr
size_t VCP_INDICES_PER_LOOKUP_SINGLE = (VCP_VEC_TYPE != VCP_VEC_KNL) and (VCP_VEC_TYPE != VCP_VEC_AVX512F) ? 1u : VCP_VEC_SIZE;
49constexpr
size_t VCP_INDICES_PER_LOOKUP_SINGLE_M1 = (VCP_VEC_TYPE != VCP_VEC_KNL) and (VCP_VEC_TYPE != VCP_VEC_AVX512F) ? 0u : VCP_VEC_SIZE_M1;
51constexpr
size_t VCP_ALIGNMENT = (VCP_VEC_TYPE != VCP_NOVEC) ?
sizeof(
vcp::RealCalcVec) : 8u;
64 #error "SIMD_DEFINITIONS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
67#if VCP_VEC_TYPE==VCP_NOVEC
// Initial-mask helper for the VCP_NOVEC (no vectorization) branch.
// The size_t parameter is unnamed, so this variant does not read the index.
// NOTE(review): the function body is not part of this excerpt; the returned
// value cannot be stated from here.
68 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& ){
// Remainder-mask helper for the VCP_NOVEC (no vectorization) branch.
// The size_t parameter is unnamed, so this variant does not read the size.
// NOTE(review): the function body is not part of this excerpt; the returned
// value cannot be stated from here.
71 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& ){
75#elif VCP_VEC_TYPE==VCP_VEC_SSE3
76 #if VCP_PREC == VCP_SPSP or VCP_PREC == VCP_SPDP
77 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
78 switch (i &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
79 case 0:
return _mm_set_epi32(~0, ~0, ~0, ~0);
80 case 1:
return _mm_set_epi32(~0, ~0, ~0, 0);
81 case 2:
return _mm_set_epi32(~0, ~0, 0, 0);
82 default:
return _mm_set_epi32(~0, 0, 0, 0);
85 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
86 switch (size &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
87 case 0:
return _mm_set_epi32(0, 0, 0, 0);
88 case 1:
return _mm_set_epi32(0, 0, 0, ~0);
89 case 2:
return _mm_set_epi32(0, 0, ~0, ~0);
90 default:
return _mm_set_epi32(0, ~0, ~0, ~0);
94 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
95 switch (i &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
96 case 0:
return _mm_set_epi32(~0, ~0, ~0, ~0);
97 default:
return _mm_set_epi32(~0, ~0, 0, 0);
100 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
101 switch (size &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
102 case 0:
return _mm_set_epi32(0, 0, 0, 0);
103 default:
return _mm_set_epi32(0, 0, ~0, ~0);
107#elif VCP_VEC_TYPE==VCP_VEC_AVX or VCP_VEC_TYPE==VCP_VEC_AVX2
108 #if VCP_PREC == VCP_SPSP or VCP_PREC == VCP_SPDP
109 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
110 switch (i &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
111 case 0:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0);
112 case 1:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, 0);
113 case 2:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, 0, 0);
114 case 3:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, 0, 0, 0);
115 case 4:
return _mm256_set_epi32(~0, ~0, ~0, ~0, 0, 0, 0, 0);
116 case 5:
return _mm256_set_epi32(~0, ~0, ~0, 0, 0, 0, 0, 0);
117 case 6:
return _mm256_set_epi32(~0, ~0, 0, 0, 0, 0, 0, 0);
118 default:
return _mm256_set_epi32(~0, 0, 0, 0, 0, 0, 0, 0);
121 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
122 switch (size &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
123 case 0:
return _mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0);
124 case 1:
return _mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, ~0);
125 case 2:
return _mm256_set_epi32( 0, 0, 0, 0, 0, 0, ~0, ~0);
126 case 3:
return _mm256_set_epi32( 0, 0, 0, 0, 0, ~0, ~0, ~0);
127 case 4:
return _mm256_set_epi32( 0, 0, 0, 0, ~0, ~0, ~0, ~0);
128 case 5:
return _mm256_set_epi32( 0, 0, 0, ~0, ~0, ~0, ~0, ~0);
129 case 6:
return _mm256_set_epi32( 0, 0, ~0, ~0, ~0, ~0, ~0, ~0);
130 default:
return _mm256_set_epi32( 0, ~0, ~0, ~0, ~0, ~0, ~0, ~0);
134 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
135 switch (i &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
136 case 0:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0);
137 case 1:
return _mm256_set_epi32(~0, ~0, ~0, ~0, ~0, ~0, 0, 0);
138 case 2:
return _mm256_set_epi32(~0, ~0, ~0, ~0, 0, 0, 0, 0);
139 default:
return _mm256_set_epi32(~0, ~0, 0, 0, 0, 0, 0, 0);
142 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
143 switch (size &
static_cast<size_t>(VCP_VEC_SIZE_M1)) {
144 case 0:
return MaskCalcVec::zero();
145 case 1:
return _mm256_set_epi32(0, 0, 0, 0, 0, 0, ~0, ~0);
146 case 2:
return _mm256_set_epi32(0, 0, 0, 0, ~0, ~0, ~0, ~0);
147 default:
return _mm256_set_epi32(0, 0, ~0, ~0, ~0, ~0, ~0, ~0);
151#elif VCP_VEC_WIDTH==VCP_VEC_W_512
153 #if VCP_PREC == VCP_SPSP or VCP_PREC == VCP_SPDP
154 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
155 static const MaskCalcVec possibleInitJMasks[VCP_VEC_SIZE] = { 0xFFFF, 0xFFFE, 0xFFFC, 0xFFF8, 0xFFF0, 0xFFE0, 0xFFC0, 0xFF80,
156 0xFF00, 0xFE00, 0xFC00, 0xF800, 0xF000, 0xE000, 0xC000, 0x8000 };
157 return possibleInitJMasks[i &
static_cast<size_t>(VCP_VEC_SIZE_M1)];
160 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
161 static const MaskCalcVec possibleRemainderJMasks[VCP_VEC_SIZE] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F,
162 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF };
163 return possibleRemainderJMasks[size &
static_cast<size_t>(VCP_VEC_SIZE_M1)];
166 static vcp_inline
MaskCalcVec vcp_simd_getInitMask(
const size_t& i){
167 static const MaskCalcVec possibleInitJMasks[VCP_VEC_SIZE] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 };
168 return possibleInitJMasks[i &
static_cast<size_t>(VCP_VEC_SIZE_M1)];
171 static vcp_inline
MaskCalcVec vcp_simd_getRemainderMask(
const size_t& size) {
172 static const MaskCalcVec possibleRemainderJMasks[VCP_VEC_SIZE] = { 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F };
173 return possibleRemainderJMasks[size &
static_cast<size_t>(VCP_VEC_SIZE_M1)];
184static vcp_inline T vcp_ceil_to_vec_size(
const T& num){
185 return (num +
static_cast<T
>(VCP_VEC_SIZE_M1)) & (~~static_cast<T>(VCP_VEC_SIZE_M1));
194static vcp_inline T vcp_floor_to_vec_size(
const T& num){
195 return num & (~~static_cast<T>(VCP_VEC_SIZE_M1));
// Defines the length of the vectors and the corresponding functions.
// Definition: RealAccumVecSPDP.h:294