T-SIMD v31.1.0
A C++ template SIMD library
mask_impl_emu.H
// ===========================================================================
//
// emulated mask functions
// Author: Markus Vieth (Bielefeld University, mvieth@techfak.uni-bielefeld.de)
// Year of creation: 2019
//
// This source code file is part of the following software:
//
//    - the low-level C++ template SIMD library
//    - the SIMD implementation of the MinWarping and the 2D-Warping methods
//      for local visual homing.
//
// The software is provided based on the accompanying license agreement in the
// file LICENSE.md.
// The software is provided "as is" without any warranty by the licensor and
// without any liability of the licensor, and the software may not be
// distributed by the licensee; see the license agreement for details.
//
// (C) Markus Vieth, Ralf Möller
//     Computer Engineering
//     Faculty of Technology
//     Bielefeld University
//     www.ti.uni-bielefeld.de
//
// ===========================================================================

// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
// namespace

// 01. Feb 23 (Jonas Keller): implemented the emulated mask functions in a more
// efficient way (described below) and optimized other small things.
// 30. Nov 22 (Jonas Keller):
// NOTE:
// The float versions of the emulated mask functions in this file, as well as
// those in SIMDVecMaskImplIntel64.H, are not as fast as they could be: they
// are implemented such that they also match the non-emulated ones in flag
// and exception behavior. This is done by masking the inputs of the masked
// functions, which for example leads to the following code for masked
// addition:
/*
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
maskz_add(const Mask<Float, SIMD_WIDTH> &k,
          const Vec<Float, SIMD_WIDTH> &a,
          const Vec<Float, SIMD_WIDTH> &b)
{
  return add(mask_ifelse(k, a, setzero<Float, SIMD_WIDTH>()),
             mask_ifelse(k, b, setzero<Float, SIMD_WIDTH>()));
}
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
mask_add(const Vec<Float, SIMD_WIDTH> &src,
         const Mask<Float, SIMD_WIDTH> &k,
         const Vec<Float, SIMD_WIDTH> &a,
         const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, maskz_add(k, a, b), src);
}
*/
// which calls mask_ifelse 3 times for one call of mask_add, instead of
/*
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
maskz_add(const Mask<Float, SIMD_WIDTH> &k,
          const Vec<Float, SIMD_WIDTH> &a,
          const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, add(a, b), setzero<Float, SIMD_WIDTH>());
}
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH>
mask_add(const Vec<Float, SIMD_WIDTH> &src,
         const Mask<Float, SIMD_WIDTH> &k,
         const Vec<Float, SIMD_WIDTH> &a,
         const Vec<Float, SIMD_WIDTH> &b)
{
  return mask_ifelse(k, add(a, b), src);
}
*/
// which calls mask_ifelse only once for one call of mask_add.
//
// The second version would, however, for example set the denormal flag if an
// input is denormalized, even if the corresponding mask bit is not set, which
// differs from the behavior of the non-emulated mask functions.
//
// It may be worth considering implementing the emulated mask functions
// analogous to the second version to improve performance. This would
// change the flag/exception behavior of the emulated mask functions.
// However, the flag/exception behavior is probably not correct in the
// whole library anyway, and probably also different on ARM.
// Additionally, T-SIMD does not provide an architecture-independent
// way to use flags or exceptions, so emulating them does not make much
// sense anyway.

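// To make the masked-operation semantics concrete, here is a small worked
// example (per-element values only, lane count chosen arbitrarily; this is
// an illustration, not actual library syntax):
/*
  k   = {1,  0,  1,  0}
  a   = {1,  2,  3,  4}
  b   = {10, 20, 30, 40}
  src = {9,  9,  9,  9}

  maskz_add(k, a, b)     -> {11, 0, 33, 0}   (inactive lanes zeroed)
  mask_add(src, k, a, b) -> {11, 9, 33, 9}   (inactive lanes keep src)
*/
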
#pragma once
#ifndef SIMD_VEC_MASK_IMPL_EMU_H_
#define SIMD_VEC_MASK_IMPL_EMU_H_

#include "alloc.H"
#include "base.H"
#include "defs.H"
#include "intel/base_impl_intel64.H"
#include "types.H"
#include "vec.H"

#include <cstddef>
#include <cstdint>

#ifndef SIMDVEC_SANDBOX

namespace simd {
// 05. Feb 23 (Jonas Keller): introduced generic emulated Mask class using the
// Vec class

// exclude from doxygen (until endcond)
/// @cond

template <typename T, size_t SIMD_WIDTH>
class Mask
{
  Vec<T, SIMD_WIDTH> mask;

public:
  Mask() = default;
  explicit SIMD_INLINE Mask(const Vec<T, SIMD_WIDTH> &x) : mask(x) {}
  SIMD_INLINE Mask(const uint64_t x) : mask(int2bits<T, SIMD_WIDTH>(x)) {}
  explicit SIMD_INLINE operator Vec<T, SIMD_WIDTH>() const { return mask; }
  SIMD_INLINE operator uint64_t() const { return msb2int<T, SIMD_WIDTH>(mask); }
  SIMD_INLINE bool operator[](const size_t i) const
  {
    if (i >= Vec<T, SIMD_WIDTH>::elems) { return false; }
    T mask_array[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
    store(mask_array, mask);
    return mask_array[i] != T(0);
  }
  SIMD_INLINE bool operator==(const Mask<T, SIMD_WIDTH> &other) const
  {
    return test_all_zeros(
      bit_xor(reinterpret<Int>(mask), reinterpret<Int>(other.mask)));
  }
  // define operators new and delete to ensure proper alignment, since
  // the default new and delete are not guaranteed to do so before C++17
  void *operator new(size_t size) { return aligned_malloc(SIMD_WIDTH, size); }
  void operator delete(void *p) { aligned_free(p); }
  void *operator new[](size_t size) { return aligned_malloc(SIMD_WIDTH, size); }
  void operator delete[](void *p) { aligned_free(p); }
};
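
// A brief usage sketch of the emulated Mask class (the values and the
// Float / 16-byte width are chosen purely for illustration):
/*
  simd::Mask<simd::Float, 16> k(0b0101u); // lanes 0 and 2 active
  bool b0 = k[0];                         // true
  bool b1 = k[1];                         // false
  uint64_t bits = (uint64_t) k;           // back to 0b0101
*/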
/// @endcond

namespace internal {
namespace mask {
#define EMULATE_SOP_NAME(OP, OP_NAME)                                          \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP_NAME(                       \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a)                 \
  {                                                                            \
    return mask::mask_ifelsezero(k, OP(a));                                    \
  }                                                                            \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP_NAME(                        \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,               \
    const Vec<T, SIMD_WIDTH> &a)                                               \
  {                                                                            \
    return mask::mask_ifelse(k, OP(a), src);                                   \
  }

#define EMULATE_SOP(OP) EMULATE_SOP_NAME(OP, OP)

#define EMULATE_DOP_NAME(OP, OP_NAME)                                          \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP_NAME(                       \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,                 \
    const Vec<T, SIMD_WIDTH> &b)                                               \
  {                                                                            \
    return mask::mask_ifelsezero(k, OP(a, b));                                 \
  }                                                                            \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP_NAME(                        \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,               \
    const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)                  \
  {                                                                            \
    return mask::mask_ifelse(k, OP(a, b), src);                                \
  }

#define EMULATE_DOP(OP) EMULATE_DOP_NAME(OP, OP)
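
// For orientation, EMULATE_DOP(add) expands to roughly the following pair
// of functions (a sketch with the line continuations removed):
/*
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_add(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,
  const Vec<T, SIMD_WIDTH> &b)
{
  return mask::mask_ifelsezero(k, add(a, b));
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_add(
  const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,
  const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)
{
  return mask::mask_ifelse(k, add(a, b), src);
}
*/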

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_ifelse(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &trueVal,
  const Vec<T, SIMD_WIDTH> &falseVal)
{
  return ifelse((Vec<T, SIMD_WIDTH>) k, trueVal, falseVal);
}

// 04. Aug 22 (Jonas Keller): added mask_ifelsezero
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_ifelsezero(
  const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &trueVal)
{
  return bit_and((Vec<T, SIMD_WIDTH>) k, trueVal);
}

template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<Tout, SIMD_WIDTH> reinterpret_mask(
  const Mask<Tin, SIMD_WIDTH> &k)
{
  static_assert(sizeof(Tout) == sizeof(Tin), "");
  return Mask<Tout, SIMD_WIDTH>(reinterpret<Tout>((Vec<Tin, SIMD_WIDTH>) k));
}

// The types of the masks are kind of arbitrary
template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Int, SIMD_WIDTH> maskz_cvts(const Mask<Float, SIMD_WIDTH> &k,
                                            const Vec<Float, SIMD_WIDTH> &a)
{
  return mask::mask_ifelsezero(mask::reinterpret_mask<Int>(k),
                               ::simd::cvts<Int>(a));
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Int, SIMD_WIDTH> mask_cvts(const Vec<Int, SIMD_WIDTH> &src,
                                           const Mask<Float, SIMD_WIDTH> &k,
                                           const Vec<Float, SIMD_WIDTH> &a)
{
  return mask::mask_ifelse(mask::reinterpret_mask<Int>(k), ::simd::cvts<Int>(a),
                           src);
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Float, SIMD_WIDTH> maskz_cvts(const Mask<Int, SIMD_WIDTH> &k,
                                              const Vec<Int, SIMD_WIDTH> &a)
{
  return mask::mask_ifelsezero(mask::reinterpret_mask<Float>(k),
                               ::simd::cvts<Float>(a));
}

template <size_t SIMD_WIDTH>
SIMD_INLINE Vec<Float, SIMD_WIDTH> mask_cvts(const Vec<Float, SIMD_WIDTH> &src,
                                             const Mask<Int, SIMD_WIDTH> &k,
                                             const Vec<Int, SIMD_WIDTH> &a)
{
  return mask::mask_ifelse(mask::reinterpret_mask<Float>(k),
                           ::simd::cvts<Float>(a), src);
}

// =======================================================================
// emulated load/store
// =======================================================================

// 04. Feb 23 (Jonas Keller): improved implementation of masked load/store
// functions

template <size_t SIMD_WIDTH, typename T>
static SIMD_INLINE bool is_within_same_page(const T *const p)
{
  const uintptr_t PAGE_SIZE = 4096; // smallest page size I found
  const uintptr_t begin_page =
    reinterpret_cast<uintptr_t>(p) & ~(PAGE_SIZE - 1);
  // 29. Aug 23 (Jonas Keller): fixed wrong calculation of end_page
  const uintptr_t end_page =
    // reinterpret_cast<uintptr_t>(p + Vec<T, SIMD_WIDTH>::elems - 1) &
    // ~(PAGE_SIZE - 1);
    (reinterpret_cast<uintptr_t>(p) + SIMD_WIDTH - 1) & ~(PAGE_SIZE - 1);
  return begin_page == end_page;
}

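// Worked example of the page check, assuming the 4096-byte page size used
// above: for SIMD_WIDTH = 32 and p = 0x1FF0, the access would span
// 0x1FF0..0x200F, so begin_page = 0x1000 and end_page = 0x2000 and the
// function returns false (the caller falls back to the element-wise path).
// For p = 0x1FC0 the access spans 0x1FC0..0x1FDF within the page starting
// at 0x1000, so the full-width access is safe.
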
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_load(const Mask<T, SIMD_WIDTH> &k,
                                                 const T *const p)
{
  // if k is all zeros nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) {
    return setzero<T, SIMD_WIDTH>();
  }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector and
  // mask it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelsezero(k, load<SIMD_WIDTH>(p));
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return load<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

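// A typical use of the masked loads is a loop tail: processing the final
// n % elems elements of an array without reading past its end. The sketch
// below uses the internal tag-dispatch form of mask_set_true_low defined
// further down in this file; the array name and element type are
// illustrative only:
/*
  const size_t rem = n % Vec<Float, SIMD_WIDTH>::elems;
  const Mask<Float, SIMD_WIDTH> k =
    mask_set_true_low(rem, OutputType<Float>(), Integer<SIMD_WIDTH>());
  const Vec<Float, SIMD_WIDTH> tail = maskz_loadu(k, data + n - rem);
*/
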
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_load(const Vec<T, SIMD_WIDTH> &src,
                                                const Mask<T, SIMD_WIDTH> &k,
                                                const T *const p)
{
  // if k is all zeros nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return src; }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector and
  // mask it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelse(k, load<SIMD_WIDTH>(p), src);
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return load<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(result, src);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_loadu(const Mask<T, SIMD_WIDTH> &k,
                                                  const T *const p)
{
  // if k is all zeros nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) {
    return setzero<T, SIMD_WIDTH>();
  }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector and
  // mask it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelsezero(k, loadu<SIMD_WIDTH>(p));
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return loadu<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH) = {0};
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_loadu(const Vec<T, SIMD_WIDTH> &src,
                                                 const Mask<T, SIMD_WIDTH> &k,
                                                 const T *const p)
{
  // if k is all zeros nothing should be loaded
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return src; }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector and
  // mask it. Otherwise, we load the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    return mask::mask_ifelse(k, loadu<SIMD_WIDTH>(p), src);
  }
  // if k is all ones, we can load the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) { return loadu<SIMD_WIDTH>(p); }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T result[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(result, src);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { result[i] = p[i]; }
  }
  return load<SIMD_WIDTH>(result);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void mask_store(T *const p, const Mask<T, SIMD_WIDTH> &k,
                                   const Vec<T, SIMD_WIDTH> &a)
{
  // if k is all zeros nothing should be stored
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return; }
  // if k is all ones, we can store the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) {
    store(p, a);
    return;
  }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector,
  // mask it and store it back. Otherwise, we store the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    store(p, mask::mask_ifelse(k, a, load<SIMD_WIDTH>(p)));
    return;
  }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T a_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(a_arr, a);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { p[i] = a_arr[i]; }
  }
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void mask_storeu(T *const p, const Mask<T, SIMD_WIDTH> &k,
                                    const Vec<T, SIMD_WIDTH> &a)
{
  // if k is all zeros nothing should be stored
  if (test_all_zeros((Vec<T, SIMD_WIDTH>) k)) { return; }
  // if k is all ones, we can store the whole vector
  if (test_all_ones((Vec<T, SIMD_WIDTH>) k)) {
    storeu(p, a);
    return;
  }
  // If the range p to p+Vec<T, SIMD_WIDTH>::elems-1 lies within the same
  // page, there is no risk of a page fault, so we load the whole vector,
  // mask it and store it back. Otherwise, we store the vector element-wise.
  if (is_within_same_page<SIMD_WIDTH>(p)) {
    storeu(p, mask::mask_ifelse(k, a, loadu<SIMD_WIDTH>(p)));
    return;
  }
  T k_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(k_arr, (Vec<T, SIMD_WIDTH>) k);
  T a_arr[Vec<T, SIMD_WIDTH>::elems] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
  store(a_arr, a);
  for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
    if (k_arr[i] != T(0)) { p[i] = a_arr[i]; }
  }
}

// maskz_store(u) does not exist/does not make sense

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_set1(const Mask<T, SIMD_WIDTH> &k,
                                                 const T a)
{
  return mask::mask_ifelsezero(k, ::simd::set1<T, SIMD_WIDTH>(a));
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_set1(const Vec<T, SIMD_WIDTH> &src,
                                                const Mask<T, SIMD_WIDTH> &k,
                                                const T a)
{
  return mask::mask_ifelse(k, ::simd::set1<T, SIMD_WIDTH>(a), src);
}

EMULATE_DOP(add)
EMULATE_DOP(adds)
EMULATE_DOP(sub)
EMULATE_DOP(subs)

EMULATE_DOP(mul)
EMULATE_DOP(div)

// ---------------------------------------------------------------------------
// masked ceil, floor, round, truncate
// ---------------------------------------------------------------------------

EMULATE_SOP(ceil)
EMULATE_SOP(floor)
EMULATE_SOP(round)
EMULATE_SOP(truncate)

// ---------------------------------------------------------------------------
// masked elementary mathematical functions
// ---------------------------------------------------------------------------

EMULATE_SOP(rcp)
EMULATE_SOP(rsqrt)
EMULATE_SOP(sqrt)

EMULATE_SOP(abs)

EMULATE_DOP_NAME(bit_and, and)
EMULATE_DOP_NAME(bit_or, or)
EMULATE_DOP_NAME(bit_andnot, andnot)
EMULATE_DOP_NAME(bit_xor, xor)
EMULATE_SOP_NAME(bit_not, not )
EMULATE_SOP(neg)
EMULATE_DOP(min)
EMULATE_DOP(max)
EMULATE_SOP(div2r0)
EMULATE_SOP(div2rd)

#define EMULATE_SHIFT(OP)                                                      \
  template <size_t COUNT, typename T, size_t SIMD_WIDTH>                       \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> maskz_##OP(                            \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a)                 \
  {                                                                            \
    return mask::mask_ifelsezero(k, OP<COUNT>(a));                             \
  }                                                                            \
  template <size_t COUNT, typename T, size_t SIMD_WIDTH>                       \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> mask_##OP(                             \
    const Vec<T, SIMD_WIDTH> &src, const Mask<T, SIMD_WIDTH> &k,               \
    const Vec<T, SIMD_WIDTH> &a)                                               \
  {                                                                            \
    return mask::mask_ifelse(k, OP<COUNT>(a), src);                            \
  }
EMULATE_SHIFT(srai)
EMULATE_SHIFT(srli)
EMULATE_SHIFT(slli)

EMULATE_DOP(hadd)
EMULATE_DOP(hadds)
EMULATE_DOP(hsub)
EMULATE_DOP(hsubs)

// TODO mask parameters?

// 16. Oct 22 (Jonas Keller): added overloaded versions of mask_cmp* functions
// that only take two vector parameters and no mask parameter
#define EMULATE_CMP(OP)                                                        \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_##OP(                            \
    const Mask<T, SIMD_WIDTH> &k, const Vec<T, SIMD_WIDTH> &a,                 \
    const Vec<T, SIMD_WIDTH> &b)                                               \
  {                                                                            \
    return Mask<T, SIMD_WIDTH>(mask::mask_ifelsezero(k, OP(a, b)));            \
  }                                                                            \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_##OP(                            \
    const Vec<T, SIMD_WIDTH> &a, const Vec<T, SIMD_WIDTH> &b)                  \
  {                                                                            \
    return Mask<T, SIMD_WIDTH>(OP(a, b));                                      \
  }

EMULATE_CMP(cmplt)
EMULATE_CMP(cmple)
EMULATE_CMP(cmpeq)
EMULATE_CMP(cmpgt)
EMULATE_CMP(cmpge)
EMULATE_CMP(cmpneq)

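// Usage sketch for the emulated comparisons (types and variable names are
// illustrative): the two-argument overload returns the plain comparison
// result as a mask, the three-argument overload additionally ANDs it
// with k:
/*
  Mask<Int, SIMD_WIDTH> m  = mask_cmplt(a, b);    // m[i] = (a[i] < b[i])
  Mask<Int, SIMD_WIDTH> m2 = mask_cmplt(k, a, b); // ... && k[i]
*/
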
EMULATE_DOP(avg)

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool mask_test_all_zeros(const Mask<T, SIMD_WIDTH> &k,
                                            const Vec<T, SIMD_WIDTH> &a)
{
  return test_all_zeros(mask::mask_ifelsezero(k, a));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool mask_test_all_ones(const Mask<T, SIMD_WIDTH> &k,
                                           const Vec<T, SIMD_WIDTH> &a)
{
  // equivalent to:
  // test_all_ones(mask_ifelse(k, a, set1<Byte, SIMD_WIDTH>(0xFF)))
  return mask::mask_test_all_zeros(k, bit_not(a));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_all_ones(OutputType<T>,
                                                     Integer<SIMD_WIDTH>)
{
  return (Mask<T, SIMD_WIDTH>) set1<T, SIMD_WIDTH>(TypeInfo<T>::trueval());
}

#define EMULATE_DMASKOP(NAME)                                                  \
  template <typename T, size_t SIMD_WIDTH>                                     \
  static SIMD_INLINE Mask<T, SIMD_WIDTH> k##NAME(const Mask<T, SIMD_WIDTH> &a, \
                                                 const Mask<T, SIMD_WIDTH> &b) \
  {                                                                            \
    return (Mask<T, SIMD_WIDTH>) NAME##_((Vec<T, SIMD_WIDTH>) a,               \
                                         (Vec<T, SIMD_WIDTH>) b);              \
  }

EMULATE_DMASKOP(and)

// EMULATE_DMASKOP(andn)
// function name should be "kandn" but the vector function is "bit_andnot"
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kandn(const Mask<T, SIMD_WIDTH> &a,
                                             const Mask<T, SIMD_WIDTH> &b)
{
  return (Mask<T, SIMD_WIDTH>) bit_andnot((Vec<T, SIMD_WIDTH>) a,
                                          (Vec<T, SIMD_WIDTH>) b);
}

EMULATE_DMASKOP(or)
EMULATE_DMASKOP(xor)

// EMULATE_DMASKOP(xnor)
// there is no xnor function for vectors, so we have to do:
// bit_not(bit_xor(a, b))
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kxnor(const Mask<T, SIMD_WIDTH> &a,
                                             const Mask<T, SIMD_WIDTH> &b)
{
  return (Mask<T, SIMD_WIDTH>) bit_not(
    bit_xor((Vec<T, SIMD_WIDTH>) a, (Vec<T, SIMD_WIDTH>) b));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kadd(const Mask<T, SIMD_WIDTH> &a,
                                            const Mask<T, SIMD_WIDTH> &b)
{
  Mask<T, SIMD_WIDTH> ret;
  ret = (((uintmax_t) a) + ((uintmax_t) b));
  return ret;
}

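// Note that kadd adds the masks interpreted as integers (matching the
// AVX-512 kadd instructions), so carries propagate across lanes: e.g. for
// an 8-lane mask, 0b00000001 + 0b00000011 yields 0b00000100, which is not
// an element-wise operation.
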
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> knot(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) bit_not((Vec<T, SIMD_WIDTH>) a);
}

// shift with flexible parameter (not template), probably slower than the
// template version
// TODO: faster implementation with switch-case possible?
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftri(const Mask<T, SIMD_WIDTH> &a,
                                                uintmax_t count)
{
  // 04. Aug 22 (Jonas Keller):
  // return zero if count is larger than sizeof(uintmax_t)*8 - 1, since then
  // the >> operator is undefined, but kshift should return zero
  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
  if (count >= sizeof(uintmax_t) * 8) { return Mask<T, SIMD_WIDTH>(0); }
  return (Mask<T, SIMD_WIDTH>) (((uintmax_t) a) >> count);
}
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftli(const Mask<T, SIMD_WIDTH> &a,
                                                uintmax_t count)
{
  // 04. Aug 22 (Jonas Keller):
  // return zero if count is larger than sizeof(uintmax_t)*8 - 1, since then
  // the << operator is undefined, but kshift should return zero
  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
  if (count >= sizeof(uintmax_t) * 8) { return Mask<T, SIMD_WIDTH>(0); }
  return (Mask<T, SIMD_WIDTH>) (((uintmax_t) a) << count);
}

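// Example of the shift semantics (mirroring Intel's kshift intrinsics):
// for an 8-lane mask, kshiftri(Mask(0b10110000), 4) yields 0b00001011,
// and any count >= 64 (the bit width of uintmax_t on common platforms)
// returns the all-zero mask instead of hitting the undefined behavior of
// the >> operator.
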
// shift with template parameter
template <size_t COUNT, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftri(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) srle<COUNT>((Vec<T, SIMD_WIDTH>) a);
}
template <size_t COUNT, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> kshiftli(const Mask<T, SIMD_WIDTH> &a)
{
  return (Mask<T, SIMD_WIDTH>) slle<COUNT>((Vec<T, SIMD_WIDTH>) a);
}

// 30. Jan 23 (Jonas Keller): removed setTrueLeft/Right and replaced them with
// mask_set_true/false_low/high.

template <bool UP, typename T, size_t SIMD_WIDTH>
struct MaskSetBuffer
{
  T buffer[Vec<T, SIMD_WIDTH>::elems * 2];
  MaskSetBuffer()
  {
    for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elems; i++) {
      buffer[i] = UP ? 0 : TypeInfo<T>::trueval();
    }
    for (size_t i = Vec<T, SIMD_WIDTH>::elems;
         i < Vec<T, SIMD_WIDTH>::elems * 2; i++) {
      buffer[i] = UP ? TypeInfo<T>::trueval() : 0;
    }
  }
};

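// How the buffer trick works: buffer holds 2 * elems elements, one half
// all-false and the other half all-true, and an unaligned load at an
// offset slides a window across the boundary. For example, with elems = 4
// the UP = false buffer is {T, T, T, T, 0, 0, 0, 0} (T = trueval), so
// mask_set_true_low(3) below loads at offset 4 - 3 = 1 and yields
// {T, T, T, 0}, i.e. the low 3 elements true.
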
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_true_low(const size_t x,
                                                         OutputType<T>,
                                                         Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) {
    return mask_all_ones(OutputType<T>(), Integer<SIMD_WIDTH>());
  }
  static MaskSetBuffer<false, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(
    loadu<SIMD_WIDTH>(buffer.buffer + Vec<T, SIMD_WIDTH>::elems - x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_true_high(const size_t x,
                                                          OutputType<T>,
                                                          Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) {
    return mask_all_ones(OutputType<T>(), Integer<SIMD_WIDTH>());
  }
  static MaskSetBuffer<true, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(loadu<SIMD_WIDTH>(buffer.buffer + x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_false_low(const size_t x,
                                                          OutputType<T>,
                                                          Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) { return Mask<T, SIMD_WIDTH>(0); }
  static MaskSetBuffer<true, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(
    loadu<SIMD_WIDTH>(buffer.buffer + Vec<T, SIMD_WIDTH>::elems - x));
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Mask<T, SIMD_WIDTH> mask_set_false_high(const size_t x,
                                                           OutputType<T>,
                                                           Integer<SIMD_WIDTH>)
{
  if (x >= Vec<T, SIMD_WIDTH>::elems) { return Mask<T, SIMD_WIDTH>(0); }
  static MaskSetBuffer<false, T, SIMD_WIDTH> buffer;
  return Mask<T, SIMD_WIDTH>(loadu<SIMD_WIDTH>(buffer.buffer + x));
}

// 07. Aug 23 (Jonas Keller): added ktest_all_zeros/ones.

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool ktest_all_zeros(const Mask<T, SIMD_WIDTH> &a)
{
  return test_all_zeros((Vec<T, SIMD_WIDTH>) a);
}

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool ktest_all_ones(const Mask<T, SIMD_WIDTH> &a)
{
  return test_all_ones((Vec<T, SIMD_WIDTH>) a);
}

// 07. Aug 23 (Jonas Keller): added kcmpeq

template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE bool kcmpeq(const Mask<T, SIMD_WIDTH> &a,
                               const Mask<T, SIMD_WIDTH> &b)
{
  return internal::mask::ktest_all_zeros(internal::mask::kxor(a, b));
}

} // namespace mask
} // namespace internal
} // namespace simd

#endif // SIMDVEC_SANDBOX

#endif // SIMD_VEC_MASK_IMPL_EMU_H_