// AlignedArray.h (part of the ls1-MarDyn molecular dynamics code)
#ifndef ALIGNEDARRAY_H_
#define ALIGNEDARRAY_H_

#ifdef __SSE3__
#include <xmmintrin.h>
#endif
#include <malloc.h>
#include <new>
#include <cstring>
#include <vector>
#include "utils/mardyn_assert.h"
#include "AlignedAllocator.h"

#define CACHE_LINE_SIZE 64

// TODO: managing this class is becoming too complicated,
// mainly because of the need to remember what happens when,
// since our functions don't follow the standard naming rules
// and conventions stemming from std::vector.
//
// Switch to std::vector with a custom allocator,
// so that it is at least clear what happens when.
//
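// (The _vec member below already is such a vector; what remains is aligning
// the surrounding interface with std::vector conventions.) A sketch of what
// the TODO suggests, reusing the AlignedAllocator included above:
//
//   std::vector<double, AlignedAllocator<double, CACHE_LINE_SIZE>> v;
//   v.resize(100);  // standard semantics: size() == 100, old contents preserved
//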
// Regarding prefetching on Xeon Phi, consider the following text, taken from:
// https://tianyuliukingcrimson.wordpress.com/2015/07/01/prefetch-on-intel-mic-coprocessor-and-xeon-cpu/
//
//   Prefetch instruction
//
//   Let's take a look at two orthogonal concepts first:
//
//     non-temporal hint (NTA): informs that data will be used only once in the
//       future and causes them to be evicted from the cache after the first use
//       (most recently used data are evicted).
//     exclusive hint (E): puts the cache line on the current core in the
//       "exclusive" state; the corresponding cache lines on other cores are
//       invalidated.
//
//   The combination of temporality, exclusiveness, and locality (L1 or L2)
//   yields the 8 types of instructions supported by the present-day Knights
//   Corner MIC. They specify how the data are expected to be handled in the
//   cache, enumerated below.
//
//     instruction    hint           purpose
//     vprefetchnta   _MM_HINT_NTA   loads data to L1 and L2 cache, marks it as NTA
//     vprefetch0     _MM_HINT_T0    loads data to L1 and L2 cache
//     vprefetch1     _MM_HINT_T1    loads data to L2 cache only
//     vprefetch2     _MM_HINT_T2    loads data to L2 cache only, marks it as NTA
//                                   (counter-intuitive, as there is no NTA in the mnemonic)
//     vprefetchenta  _MM_HINT_ENTA  exclusive version of vprefetchnta
//     vprefetche0    _MM_HINT_ET0   exclusive version of vprefetch0
//     vprefetche1    _MM_HINT_ET1   exclusive version of vprefetch1
//     vprefetche2    _MM_HINT_ET2   exclusive version of vprefetch2
//
//   Note that the L2 cache of the MIC is inclusive, in the sense that it holds
//   a copy of all the data in L1.
//
//   There are two ways of implementing prefetch in C: intrinsic and assembly.
//
//   Intrinsic:
//     _mm_prefetch((const char*)addr, hint);
//
//   Inline assembly:
//     asm volatile ("prefetch_inst [%0]"::"m"(addr));
//
//   Here addr is the address of the byte starting from which to prefetch,
//   prefetch_inst is one of the prefetch instructions listed above, and hint is
//   the parameter for the compiler intrinsic. We would like to emphasize again
//   that _MM_HINT_T2 and _MM_HINT_ET2 are counter-intuitive. In fact, they are
//   misnomers, as both are non-temporal. They should have been named
//   _MM_HINT_NTA2 and _MM_HINT_ENTA2 by Intel.

// Prefetching on Xeons features far fewer hints, apparently (same link):
//
//     instruction  hint          purpose
//     prefetchnta  _MM_HINT_NTA  loads data to L2 and L3 cache, marks it as NTA
//     prefetcht0   _MM_HINT_T0   loads data to L2 and L3 cache
//     prefetcht1   _MM_HINT_T1   equivalent to prefetcht0
//     prefetcht2   _MM_HINT_T2   equivalent to prefetcht0

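// A minimal sketch of the intrinsic form in use (illustrative only; the
// function and buffer are hypothetical, not part of this file; the class
// below implements the same pattern in AlignedArray::prefetch()):
//
//   void prefetch_buffer(const double* buf, size_t n) {
//       const size_t stride = CACHE_LINE_SIZE / sizeof(double); // 8 doubles per line
//       for (size_t i = 0; i < n; i += stride) {
//           _mm_prefetch((const char*) (buf + i), _MM_HINT_T1);
//       }
//   }
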
/** \brief An aligned array. */
template<class T, size_t alignment = CACHE_LINE_SIZE>
class AlignedArray {
public:
	/** \brief Construct an empty array. */
	AlignedArray() :
			_vec(0) {
	}

	/** \brief Construct an array of n elements. */
	AlignedArray(size_t n) :
			_vec(n) {
	}

	/** \brief Construct a copy of another AlignedArray. */
	AlignedArray(const AlignedArray& a) :
			_vec(a._vec) {
	}

	/** \brief Assign a copy of another AlignedArray. */
	AlignedArray& operator=(const AlignedArray& a) {
		_vec = a._vec;
		return *this;
	}

	/** \brief Free the array. */
	virtual ~AlignedArray() {
	}

#if defined(__SSE3__) or defined(__MIC__)
	// note: the hint argument is currently unused; the hint is hard-coded below
	virtual void prefetch(int hint = 1, int n = -1) const {
		mardyn_assert(n >= -2);

		size_t endPrefetch;
		const int stride = _round_up(1);

		switch (n) {
		case -1:
			// prefetch all up to capacity()
			endPrefetch = _vec.capacity();
			break;
		case -2:
			// prefetch all up to size()
			endPrefetch = _vec.size();
			break;
		default:
			// prefetch only the first n elements
			endPrefetch = n;
		}

		for (size_t i = 0; i < endPrefetch; i += stride) {
			const T& val = _vec[i];
			const T* valP = &val;
#if defined(__MIC__)
			_mm_prefetch((const char*) valP, 2);
#else
			_mm_prefetch((const char*) valP, _MM_HINT_T1);
#endif
		}
	}
#else
	virtual void prefetch(int /*hint = 1*/, int /*n = -1*/) const {
	}
#endif

	virtual void increaseStorage(size_t oldNumElements, size_t additionalElements) {
		mardyn_assert(oldNumElements <= _vec.capacity());

		size_t newNumElements = oldNumElements + additionalElements;

		if (newNumElements <= _vec.capacity()) {
			// no need to resize
			return;
		}

		// we need to resize, but also keep the contents
		_vec.reserve(_round_up(newNumElements));
		_vec.resize(_vec.capacity());
	}

	void appendValue(T v, size_t oldNumElements) {
		increaseStorage(oldNumElements, 1);

		_vec[oldNumElements] = v;
	}

	virtual size_t resize_zero_shrink(size_t exact_size, bool zero_rest_of_CL = false,
			bool allow_shrink = false) {
		size_t size_rounded_up = _round_up(exact_size);

		bool need_resize = size_rounded_up > _vec.size()
				or (allow_shrink and size_rounded_up < _vec.size());

		if (need_resize) {
			_vec.reserve(size_rounded_up);
			_vec.resize(_vec.capacity());
		}
		// we might still need to zero the rest of the cache line;
		// memset counts bytes, hence the factor sizeof(T)
		if (zero_rest_of_CL and size_rounded_up > 0) {
			std::memset(_vec.data() + exact_size, 0,
					(size_rounded_up - exact_size) * sizeof(T));
		}

		mardyn_assert(size_rounded_up <= _vec.size());
		return _vec.size();
	}

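	// Worked example for resize_zero_shrink above (illustrative, assuming
	// T = double with the default 64-byte alignment, i.e. 8 elements per cache
	// line): resize_zero_shrink(10, true) rounds 10 up to 16, grows the vector
	// to at least 16 elements if needed, zeroes elements 10..15, and returns
	// the resulting size (16 here).
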
	/** \brief Reallocate the array. All content may be lost. */
	virtual void resize(size_t n) {
		_vec.reserve(_round_up(n));
		_vec.resize(_vec.capacity());
	}

	virtual void zero(size_t start_idx = 0) {
		if (_vec.size() > 0 and start_idx < _vec.capacity()) {
			size_t num_to_zero = _vec.capacity() - start_idx;
			std::memset(_vec.data() + start_idx, 0, num_to_zero * sizeof(T));
		}
	}

	/** \brief Return the current size in terms of elements. */
	inline size_t get_size() const {
		return _vec.size();
	}

	/** \brief Implicit conversion to a raw pointer to the aligned storage. */
	operator T*() {
		return _vec.data();
	}

	operator const T*() const {
		return _vec.data();
	}

	/** \brief Return the amount of dynamically allocated memory in bytes. */
	size_t get_dynamic_memory() const {
		return _vec.capacity() * sizeof(T);
	}

	/** \brief Round n up to the next multiple of (alignment / sizeof(T)) elements,
	 * so that the rounded size always spans whole cache lines.
	 * Assumes alignment / sizeof(T) is a power of two. */
	static size_t _round_up(size_t n) {
		unsigned long j = alignment / sizeof(T) - 1;
		unsigned long ret = (n + j) & ~j;
		return ret;
	}
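
	// Worked example (illustrative): for T = double and alignment = 64,
	// alignment / sizeof(T) = 8, so j = 7 and
	// _round_up(10) = (10 + 7) & ~7 = 16, i.e. two full cache lines.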

protected:

	std::vector<T, AlignedAllocator<T, alignment>> _vec;

};
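
// Usage sketch (illustrative; the element type and sizes are arbitrary):
//
//   AlignedArray<double> a;    // empty array with 64-byte-aligned storage
//   a.resize(100);             // capacity rounded up to whole cache lines (104 doubles)
//   a.zero();                  // zero everything up to capacity()
//   a.appendValue(1.0, 0);     // grow if needed, then write element 0
//   double* raw = a;           // implicit conversion to the aligned raw pointer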

#endif /* ALIGNEDARRAY_H_ */