T-SIMD v31.1.2
A C++ template SIMD library
mask_impl_emu.H
// ===========================================================================
//
// emulated mask functions
// Author: Markus Vieth (Bielefeld University, mvieth@techfak.uni-bielefeld.de)
// Year of creation: 2019
//
// This source code file is part of the following software:
//
// - the low-level C++ template SIMD library
// - the SIMD implementation of the MinWarping and the 2D-Warping methods
//   for local visual homing.
//
// The software is provided based on the accompanying license agreement in the
// file LICENSE.md.
// The software is provided "as is" without any warranty by the licensor and
// without any liability of the licensor, and the software may not be
// distributed by the licensee; see the license agreement for details.
//
// (C) Markus Vieth, Ralf Möller
//     Computer Engineering
//     Faculty of Technology
//     Bielefeld University
//     www.ti.uni-bielefeld.de
//
// ===========================================================================

// 22. Jan 23 (Jonas Keller): moved internal implementations into the internal
// namespace

// 01. Feb 23 (Jonas Keller): implemented the emulated mask functions more
// efficiently (as described below) and optimized other small things.
// 30. Nov 22 (Jonas Keller):
// NOTE:
// The float versions of the emulated mask functions in this file, as well as
// those in SIMDVecMaskImplIntel64.H, are not as fast as they could be: they
// are implemented such that they also match the non-emulated ones in flag and
// exception behavior. This is done by masking the inputs of the masked
// functions, which for example leads to the following code for masked
// addition:
/*
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
maskz_add(const Mask<Float, SIMD_WIDTH> &k,
          const Vec<Float, SIMD_WIDTH> &a,
          const Vec<Float, SIMD_WIDTH> &b)
{
  return add(mask_ifelse(k, a, setzero<Float, SIMD_WIDTH>()),
             mask_ifelse(k, b, setzero<Float, SIMD_WIDTH>()));
}
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
mask_add(const Vec<Float, SIMD_WIDTH> &src,
         const Mask<Float, SIMD_WIDTH> &k,
         const Vec<Float, SIMD_WIDTH> &a,
         const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, maskz_add(k, a, b), src);
}
*/
// which calls mask_ifelse three times for one call of mask_add, instead of
/*
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
maskz_add(const Mask<Float, SIMD_WIDTH> &k,
          const Vec<Float, SIMD_WIDTH> &a,
          const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, add(a, b), setzero<Float, SIMD_WIDTH>());
}
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
mask_add(const Vec<Float, SIMD_WIDTH> &src,
         const Mask<Float, SIMD_WIDTH> &k,
         const Vec<Float, SIMD_WIDTH> &a,
         const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, add(a, b), src);
}
*/
// which calls mask_ifelse only once per call of mask_add.
//
// The second version would, however, set the denormal flag if an input is
// denormalized even when the corresponding mask bit is not set, which differs
// from the behavior of the non-emulated mask functions.
//
// It may be worth implementing the emulated mask functions analogously to the
// second version to improve performance. This would change the flag/exception
// behavior of the emulated mask functions. However, the flag/exception
// behavior is probably not correct throughout the whole library anyway, and
// probably also differs on ARM. Additionally, T-SIMD does not provide an
// architecture-independent way to access flags or exceptions, so emulating
// them does not make much sense anyway.
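//
// To make the intended semantics concrete, here is a minimal scalar model of
// one lane of maskz_add/mask_add (illustration only, not part of the library;
// the names are made up for this sketch):
/*
// zero-masking: a masked-out lane yields zero and must not evaluate a + b
// (evaluating it could raise FP flags/exceptions for that lane)
inline float scalar_maskz_add(bool k, float a, float b)
{
  return k ? (a + b) : 0.0f;
}
// merge-masking: a masked-out lane keeps the value from src
inline float scalar_mask_add(float src, bool k, float a, float b)
{
  return k ? (a + b) : src;
}
*/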

#pragma once
#ifndef SIMD_VEC_MASK_IMPL_EMU_H_
#define SIMD_VEC_MASK_IMPL_EMU_H_

#include "alloc.H"
#include "base.H"
#include "defs.H"
#include "intel/base_impl_intel64.H"
#include "types.H"
#include "vec.H"

#include <cstddef>
#include <cstdint>

#ifndef SIMDVEC_SANDBOX

namespace simd {
// 05. Feb 23 (Jonas Keller): introduced generic emulated Mask class using the
// Vec class

// exclude from doxygen (until endcond)
/// @cond

template <typename T, size_t SIMD_WIDTH>
class Mask
{
  Vec<T, SIMD_WIDTH> mask;

public:
  Mask() = default;
  explicit SIMD_INLINE Mask(const Vec<T, SIMD_WIDTH> &x)
  {
    // shift the most significant bit into all bits
    const auto &xInt = reinterpret<typename TypeInfo<T>::IntegerType>(x);
    const auto &shifted = srai<sizeof(T) * 8 - 1>(xInt);
    mask = reinterpret<T>(shifted);
  }
  SIMD_INLINE Mask(const uint64_t x) : mask(int2bits<T, SIMD_WIDTH>(x)) {}
  explicit SIMD_INLINE operator Vec<T, SIMD_WIDTH>() const { return mask; }
  SIMD_INLINE operator uint64_t() const { return msb2int<T, SIMD_WIDTH>(mask); }
  SIMD_INLINE bool operator[](const size_t i) const
  {
    if (i >= Vec<T, SIMD_WIDTH>::elems) { return false; }
    T mask_array[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
    store(mask_array, mask);
    return mask_array[i] != T(0);
  }
  SIMD_INLINE bool operator==(const Mask<T, SIMD_WIDTH> &other) const
  {
    return test_all_zeros(
      bit_xor(reinterpret<Int>(mask), reinterpret<Int>(other.mask)));
  }
  // define operators new and delete to ensure proper alignment, since
  // the default new and delete are not guaranteed to do so before C++17
  void *operator new(size_t size) { return aligned_malloc(SIMD_WIDTH, size); }
  void operator delete(void *p) { aligned_free(p); }
  void *operator new[](size_t size) { return aligned_malloc(SIMD_WIDTH, size); }
  void operator delete[](void *p) { aligned_free(p); }
};
/// @endcond
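
// Brief usage sketch for the emulated Mask class (illustration only; the
// element type Float and a width of 16 bytes are assumptions made for this
// example):
/*
void mask_demo()
{
  using namespace simd;
  // construct a mask from an integer bit pattern: bit i controls element i
  Mask<Float, 16> k(0x5);             // elements 0 and 2 are selected
  const bool b0 = k[0];               // true
  const bool b1 = k[1];               // false
  const uint64_t bits = (uint64_t) k; // back to the bit pattern, 0x5
  (void) b0; (void) b1; (void) bits;
}
*/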

namespace internal {
namespace mask {
#define EMULATE_SOP_NAME(OP, OP_NAME)                                        \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP_NAME(                     \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a)               \
  {                                                                          \
    return mask::mask_ifelsezero(k, OP(a));                                  \
  }                                                                          \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP_NAME(                      \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,             \
    const Vec<T, SIMD_WIDTH> &a)                                             \
  {                                                                          \
    return mask::mask_ifelse(k, OP(a), src);                                 \
  }

#define EMULATE_SOP(OP) EMULATE_SOP_NAME(OP, OP)

#define EMULATE_DOP_NAME(OP, OP_NAME)                                        \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP_NAME(                     \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,               \
    const Vec<T, SIMD_WIDTH> &b)                                             \
  {                                                                          \
    return mask::mask_ifelsezero(k, OP(a, b));                               \
  }                                                                          \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP_NAME(                      \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,             \
    const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)                \
  {                                                                          \
    return mask::mask_ifelse(k, OP(a, b), src);                              \
  }

#define EMULATE_DOP(OP) EMULATE_DOP_NAME(OP, OP)
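
// For illustration, EMULATE_DOP(add) expands to roughly the following pair of
// functions (sketch; whitespace differs from the actual expansion):
/*
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_add(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,
  const Vec<T, SIMD_WIDTH> &b)
{
  // zero-masking: masked-out elements become zero
  return mask::mask_ifelsezero(k, add(a, b));
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_add(
  const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,
  const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)
{
  // merge-masking: masked-out elements are taken from src
  return mask::mask_ifelse(k, add(a, b), src);
}
*/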

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_ifelse(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &trueVal,
  const Vec<T, SIMD_WIDTH> &falseVal)
{
  return ifelse((Vec<T, SIMD_WIDTH>) k, trueVal, falseVal);
}

// 04. Aug 22 (Jonas Keller): added mask_ifelsezero
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_ifelsezero(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &trueVal)
{
  return bit_and((Vec<T, SIMD_WIDTH>) k, trueVal);
}

template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<Tout, SIMD_WIDTH> reinterpret_mask(
  const Mask<Tin, SIMD_WIDTH> &k)
{
  static_assert(sizeof(Tout) == sizeof(Tin), "");
  return Mask<Tout, SIMD_WIDTH>(reinterpret<Tout>((Vec<Tin, SIMD_WIDTH>) k));
}

// The element types of the masks here are somewhat arbitrary; they are chosen
// to match the respective input vector type.
template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Int, SIMD_WIDTH> maskz_cvts(const Mask<Float, SIMD_WIDTH> &k,
                                            const Vec<Float, SIMD_WIDTH> &a)
{
  return mask::mask_ifelsezero(mask::reinterpret_mask<Int>(k),
                               ::simd::cvts<Int>(a));
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Int, SIMD_WIDTH> mask_cvts(const Vec<Int, SIMD_WIDTH> &src,
                                           const Mask<Float, SIMD_WIDTH> &k,
                                           const Vec<Float, SIMD_WIDTH> &a)
{
  return mask::mask_ifelse(mask::reinterpret_mask<Int>(k), ::simd::cvts<Int>(a),
                           src);
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Float, SIMD_WIDTH> maskz_cvts(const Mask<Int, SIMD_WIDTH> &k,
                                              const Vec<Int, SIMD_WIDTH> &a)
{
  return mask::mask_ifelsezero(mask::reinterpret_mask<Float>(k),
                               ::simd::cvts<Float>(a));
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Float, SIMD_WIDTH> mask_cvts(const Vec<Float, SIMD_WIDTH> &src,
                                             const Mask<Int, SIMD_WIDTH> &k,
                                             const Vec<Int, SIMD_WIDTH> &a)
{
  return mask::mask_ifelse(mask::reinterpret_mask<Float>(k),
                           ::simd::cvts<Float>(a), src);
}

// =======================================================================
// emulated load/store
// =======================================================================

// 04. Feb 23 (Jonas Keller): improved implementation of masked load/store
// functions

template <size_t SIMD_WIDTH, typename T>
static SIMD_INLINE bool is_within_same_page(const T *const p)
{
  // 4096 is the smallest page size on the platforms considered here;
  // assuming the smallest size makes the check conservative
  const uintptr_t PAGE_SIZE = 4096;
  const uintptr_t begin_page =
    reinterpret_cast<uintptr_t>(p) & ~(PAGE_SIZE - 1);
  // 29. Aug 23 (Jonas Keller): fixed wrong calculation of end_page
  const uintptr_t end_page =
    // reinterpret_cast<uintptr_t>(p + Vec<T, SIMD_WIDTH>::elems - 1) &
    // ~(PAGE_SIZE - 1);
    (reinterpret_cast<uintptr_t>(p) + SIMD_WIDTH - 1) & ~(PAGE_SIZE - 1);
  return begin_page == end_page;
}
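
// Worked example (assumed numbers): with SIMD_WIDTH = 32 and p = (T *) 0x1FF0,
// begin_page = 0x1FF0 & ~0xFFF = 0x1000 and end_page = (0x1FF0 + 31) & ~0xFFF
// = 0x2000, so the access crosses a page boundary and the function returns
// false; for p = (T *) 0x1FC0 both round down to 0x1000 and it returns true.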

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_load(const Mask<T, SIMD_WIDTH> &k,
                                                 const T *const p)
{
  // if k is all zeros, nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) {
    return setzero<T, SIMD_WIDTH>();
  }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector and mask
  // it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelsezero(k, load<SIMD_WIDTH>(p));
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return load<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_load(const Vec<T, SIMD_WIDTH> &src,
                                                const Mask<T, SIMD_WIDTH> &k,
                                                const T *const p)
{
  // if k is all zeros, nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return src; }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector and mask
  // it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelse(k, load<SIMD_WIDTH>(p), src);
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return load<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(result, src);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_loadu(const Mask<T, SIMD_WIDTH> &k,
                                                  const T *const p)
{
  // if k is all zeros, nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) {
    return setzero<T, SIMD_WIDTH>();
  }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector and mask
  // it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelsezero(k, loadu<SIMD_WIDTH>(p));
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return loadu<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_loadu(const Vec<T, SIMD_WIDTH> &src,
                                                 const Mask<T, SIMD_WIDTH> &k,
                                                 const T *const p)
{
  // if k is all zeros, nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return src; }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector and mask
  // it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelse(k, loadu<SIMD_WIDTH>(p), src);
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return loadu<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(result, src);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void mask_store(T *const p, const Mask<T, SIMD_WIDTH> &k,
                                   const Vec<T, SIMD_WIDTH> &a)
{
  // if k is all zeros, nothing should be stored
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return; }
  // if k is all ones, we can store the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) {
    store(p, a);
    return;
  }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector, mask it
  // and store it back. Otherwise, we store the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    store(p, mask::mask_ifelse(k, a, load<SIMD_WIDTH>(p)));
    return;
  }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T a_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(a_arr, a);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { p[i] = a_arr[i]; }
  }
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void mask_storeu(T *const p, const Mask<T, SIMD_WIDTH> &k,
                                    const Vec<T, SIMD_WIDTH> &a)
{
  // if k is all zeros, nothing should be stored
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return; }
  // if k is all ones, we can store the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) {
    storeu(p, a);
    return;
  }
  // If p through p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same page,
  // there is no risk of a page fault, so we load the whole vector, mask it
  // and store it back. Otherwise, we store the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    storeu(p, mask::mask_ifelse(k, a, loadu<SIMD_WIDTH>(p)));
    return;
  }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T a_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(a_arr, a);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { p[i] = a_arr[i]; }
  }
}
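
// Usage sketch (illustration only; the width of 16 bytes is an assumption):
// masked load/store allow processing the tail of an array whose length is not
// a multiple of the vector size without touching memory past the end:
/*
void scale2_tail(const float *in, float *out, size_t n)
{
  using namespace simd;
  constexpr size_t W     = 16;                   // assumed SIMD width in bytes
  constexpr size_t elems = Vec<Float, W>::elems; // elements per vector
  size_t i = 0;
  for (; i + elems <= n; i += elems) { // full vectors
    storeu(out + i, mul(loadu<W>(in + i), set1<Float, W>(2.0f)));
  }
  // tail: mask with only the low n - i bits set
  const Mask<Float, W> k((uint64_t(1) << (n - i)) - 1);
  const Vec<Float, W> v = internal::mask::maskz_loadu(k, in + i);
  internal::mask::mask_storeu(out + i, k, mul(v, set1<Float, W>(2.0f)));
}
*/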

// maskz_store(u) does not exist: for stores there is no result register whose
// masked-out elements could be zeroed

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_set1(const Mask<T, SIMD_WIDTH> &k,
                                                 const T a)
{
  return mask::mask_ifelsezero(k, ::simd::set1<T, SIMD_WIDTH>(a));
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_set1(const Vec<T, SIMD_WIDTH> &src,
                                                const Mask<T, SIMD_WIDTH> &k,
                                                const T a)
{
  return mask::mask_ifelse(k, ::simd::set1<T, SIMD_WIDTH>(a), src);
}

EMULATE_DOP(add)
EMULATE_DOP(adds)
EMULATE_DOP(sub)
EMULATE_DOP(subs)

EMULATE_DOP(mul)
EMULATE_DOP(div)

// ---------------------------------------------------------------------------
// masked ceil, floor, round, truncate
// ---------------------------------------------------------------------------

EMULATE_SOP(ceil)
EMULATE_SOP(floor)
EMULATE_SOP(round)
EMULATE_SOP(truncate)

// ---------------------------------------------------------------------------
// masked elementary mathematical functions
// ---------------------------------------------------------------------------

EMULATE_SOP(rcp)
EMULATE_SOP(rsqrt)
EMULATE_SOP(sqrt)

EMULATE_SOP(abs)

EMULATE_DOP_NAME(bit_and, and)
EMULATE_DOP_NAME(bit_or, or)
EMULATE_DOP_NAME(bit_andnot, andnot)
EMULATE_DOP_NAME(bit_xor, xor)
EMULATE_SOP_NAME(bit_not, not)
EMULATE_SOP(neg)
EMULATE_DOP(min)
EMULATE_DOP(max)
EMULATE_SOP(div2r0)
EMULATE_SOP(div2rd)

#define EMULATE_SHIFT(OP)                                                    \
  template <size_t COUNT, typename T, size_t SIMD_WIDTH>                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP(                          \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a)               \
  {                                                                          \
    return mask::mask_ifelsezero(k, OP<COUNT>(a));                           \
  }                                                                          \
  template <size_t COUNT, typename T, size_t SIMD_WIDTH>                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP(                           \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,             \
    const Vec<T, SIMD_WIDTH> &a)                                             \
  {                                                                          \
    return mask::mask_ifelse(k, OP<COUNT>(a), src);                          \
  }
EMULATE_SHIFT(srai)
EMULATE_SHIFT(srli)
EMULATE_SHIFT(slli)

EMULATE_DOP(hadd)
EMULATE_DOP(hadds)
EMULATE_DOP(hsub)
EMULATE_DOP(hsubs)

// TODO: mask parameters?

// 16. Oct 22 (Jonas Keller): added overloaded versions of mask_cmp* functions
// that only take two vector parameters and no mask parameter
#define EMULATE_CMP(OP)                                                      \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_##OP(                          \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,               \
    const Vec<T, SIMD_WIDTH> &b)                                             \
  {                                                                          \
    return Mask<T, SIMD_WIDTH>(mask::mask_ifelsezero(k, OP(a, b)));          \
  }                                                                          \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_##OP(                          \
    const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)                \
  {                                                                          \
    return Mask<T, SIMD_WIDTH>(OP(a, b));                                    \
  }

EMULATE_CMP(cmplt)
EMULATE_CMP(cmple)
EMULATE_CMP(cmpeq)
EMULATE_CMP(cmpgt)
EMULATE_CMP(cmpge)
EMULATE_CMP(cmpneq)

EMULATE_DOP(avg)
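
// A small usage sketch combining the emulated compare and arithmetic masks
// (illustration only; the function name is made up): add b to a only where
// a < b, i.e. an element-wise "accumulate if smaller":
/*
template <size_t SIMD_WIDTH>
simd::Vec<simd::Float, SIMD_WIDTH> add_where_smaller(
  const simd::Vec<simd::Float, SIMD_WIDTH> &a,
  const simd::Vec<simd::Float, SIMD_WIDTH> &b)
{
  using namespace simd::internal::mask;
  // mask is set where a < b
  const simd::Mask<simd::Float, SIMD_WIDTH> k = mask_cmplt(a, b);
  // merge-masking: where k is set take a + b, elsewhere keep a
  return mask_add(a, k, a, b);
}
*/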

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool mask_test_all_zeros(const Mask<T, SIMD_WIDTH> &k,
                                            const Vec<T, SIMD_WIDTH> &a)
{
  return test_all_zeros(mask::mask_ifelsezero(k, a));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool mask_test_all_ones(const Mask<T, SIMD_WIDTH> &k,
                                           const Vec<T, SIMD_WIDTH> &a)
{
  // the selected elements of a are all ones iff the selected elements of
  // bit_not(a) are all zeros
  return mask::mask_test_all_zeros(k, bit_not(a));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_all_ones(OutputType<T>,
                                                     Integer<SIMD_WIDTH>)
{
  return (Mask<T, SIMD_WIDTH>) set1<T, SIMD_WIDTH>(TypeInfo<T>::trueval());
}

#define EMULATE_DMASKOP(NAME)                                                \
  template <typename T, size_t SIMD_WIDTH>                                   \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> k##NAME(                            \
    const Mask<T, SIMD_WIDTH> &a, const Mask<T, SIMD_WIDTH> &b)              \
  {                                                                          \
    return (Mask<T, SIMD_WIDTH>) NAME##_((Vec<T, SIMD_WIDTH>) a,             \
                                          (Vec<T, SIMD_WIDTH>) b);           \
  }

EMULATE_DMASKOP(and)

// EMULATE_DMASKOP(andn)
// the function name should be "kandn", but the vector function is "bit_andnot"
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kandn(const Mask<T, SIMD_WIDTH> &a,
                                             const Mask<T, SIMD_WIDTH> &b)
{
  return (Mask<T, SIMD_WIDTH>) bit_andnot((Vec<T, SIMD_WIDTH>) a,
                                          (Vec<T, SIMD_WIDTH>) b);
}

EMULATE_DMASKOP(or)
EMULATE_DMASKOP(xor)

// EMULATE_DMASKOP(xnor)
// there is no xnor function for vectors, so we have to use
// bit_not(bit_xor(a, b))
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kxnor(const Mask<T, SIMD_WIDTH> &a,
                                             const Mask<T, SIMD_WIDTH> &b)
{
  return (Mask<T, SIMD_WIDTH>) bit_not(
    bit_xor((Vec<T, SIMD_WIDTH>) a, (Vec<T, SIMD_WIDTH>) b));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kadd(const Mask<T, SIMD_WIDTH> &a,
                                            const Mask<T, SIMD_WIDTH> &b)
{
  Mask<T, SIMD_WIDTH> ret;
  ret = (((uintmax_t) a) + ((uintmax_t) b));
  return ret;
}
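
// kadd treats both masks as integer bit patterns (via the uint64_t
// conversion) and adds them as integers; e.g. for 8-element masks,
// 0b00001111 + 0b00000001 yields 0b00010000. This mirrors the integer
// addition that the AVX-512 kadd* instructions perform on mask registers.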

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> knot(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) bit_not((Vec<T, SIMD_WIDTH>) a);
}

// shift with a run-time count (not a template parameter), probably slower
// than the template version
// TODO: faster implementation with switch-case possible?
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftri(const Mask<T, SIMD_WIDTH> &a,
                                                uintmax_t count)
{
  // 04. Aug 22 (Jonas Keller):
  // return zero if count is larger than sizeof(uintmax_t)*8 - 1, since then
  // the >> operator is undefined, but kshift should return zero
  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
  if (count >= sizeof(uintmax_t) * 8) { return Mask<T, SIMD_WIDTH>(0); }
  return (Mask<T, SIMD_WIDTH>) (((uintmax_t) a) >> count);
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftli(const Mask<T, SIMD_WIDTH> &a,
                                                uintmax_t count)
{
  // 04. Aug 22 (Jonas Keller):
  // return zero if count is larger than sizeof(uintmax_t)*8 - 1, since then
  // the << operator is undefined, but kshift should return zero
  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
  if (count >= sizeof(uintmax_t) * 8) { return Mask<T, SIMD_WIDTH>(0); }
  return (Mask<T, SIMD_WIDTH>) (((uintmax_t) a) << count);
}

// shift with template parameter
template <size_t COUNT, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftri(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) srle<COUNT>((Vec<T, SIMD_WIDTH>) a);
}
template <size_t COUNT, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftli(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) slle<COUNT>((Vec<T, SIMD_WIDTH>) a);
}

// 30. Jan 23 (Jonas Keller): removed setTrueLeft/Right and replaced them with
// mask_set_true/false_low/high.

template <bool UP, typename T, size_t SIMD_WIDTH>
struct MaskSetBuffer
{
  T buffer[Vec<T, SIMD_WIDTH>::elems * 2];
  MaskSetBuffer()
  {
    for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
      buffer[i] = UP ? 0 : TypeInfo<T>::trueval();
    }
    for (size_t i = Vec<T, SIMD_WIDTH>::elems;
         i < Vec<T, SIMD_WIDTH>::elems * 2; i++) {
      buffer[i] = UP ? TypeInfo<T>::trueval() : 0;
    }
  }
};

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_true_low(const size_t x,
                                                         OutputType<T>,
                                                         Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) {
    return mask_all_ones(OutputType<T>(), Integer<SIMD_WIDTH>());
  }
  static MaskSetBuffer<false, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(
    loadu<SIMD_WIDTH>(buffer.buffer + Vec<T, SIMD_WIDTH>::elems - x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_true_high(const size_t x,
                                                          OutputType<T>,
                                                          Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) {
    return mask_all_ones(OutputType<T>(), Integer<SIMD_WIDTH>());
  }
  static MaskSetBuffer<true, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(loadu<SIMD_WIDTH>(buffer.buffer + x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_false_low(const size_t x,
                                                          OutputType<T>,
                                                          Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) { return Mask<T, SIMD_WIDTH>(0); }
  static MaskSetBuffer<true, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(
    loadu<SIMD_WIDTH>(buffer.buffer + Vec<T, SIMD_WIDTH>::elems - x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_false_high(const size_t x,
                                                           OutputType<T>,
                                                           Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) { return Mask<T, SIMD_WIDTH>(0); }
  static MaskSetBuffer<false, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(loadu<SIMD_WIDTH>(buffer.buffer + x));
}
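
// Usage sketch (illustration only; W and rest are assumed to be defined by
// the caller): mask_set_true_low(x, ...) yields a mask selecting the lowest
// x elements, which is another way to build the tail mask from the masked
// load/store example above:
/*
// k selects the first `rest` elements of a Vec<Float, W>
Mask<Float, W> k =
  mask_set_true_low(rest, OutputType<Float>(), Integer<W>());
*/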

// 07. Aug 23 (Jonas Keller): added ktest_all_zeros/ones.

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool ktest_all_zeros(const Mask<T, SIMD_WIDTH> &a)
{
  return test_all_zeros((Vec<T, SIMD_WIDTH>) a);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool ktest_all_ones(const Mask<T, SIMD_WIDTH> &a)
{
  return test_all_ones((Vec<T, SIMD_WIDTH>) a);
}

// 07. Aug 23 (Jonas Keller): added kcmpeq

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool kcmpeq(const Mask<T, SIMD_WIDTH> &a,
                               const Mask<T, SIMD_WIDTH> &b)
{
  return internal::mask::ktest_all_zeros(internal::mask::kxor(a, b));
}

} // namespace mask
} // namespace internal
} // namespace simd

#endif // SIMDVEC_SANDBOX

#endif // SIMD_VEC_MASK_IMPL_EMU_H_