T-SIMD v31.1.0
A C++ template SIMD library
mask_impl_intel64.H
1// ===========================================================================
2//
3// Mask class definitions and architecture specific functions
4// for Intel 64 byte (512 bit)
5// Author: Markus Vieth (Bielefeld University, mvieth@techfak.uni-bielefeld.de)
6// Year of creation: 2019
7//
8// This source code file is part of the following software:
9//
10// - the low-level C++ template SIMD library
11// - the SIMD implementation of the MinWarping and the 2D-Warping methods
12// for local visual homing.
13//
14// The software is provided based on the accompanying license agreement in the
15// file LICENSE.md.
16// The software is provided "as is" without any warranty by the licensor and
17// without any liability of the licensor, and the software may not be
18// distributed by the licensee; see the license agreement for details.
19//
20// (C) Markus Vieth, Ralf Möller
21// Computer Engineering
22// Faculty of Technology
23// Bielefeld University
24// www.ti.uni-bielefeld.de
25//
26// ===========================================================================
27
28// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
29// namespace
30
31#pragma once
32#ifndef SIMD_VEC_MASK_IMPL_INTEL_64_H_
33#define SIMD_VEC_MASK_IMPL_INTEL_64_H_
34
35#include "../base.H"
36#include "../defs.H"
37#include "../mask_impl_emu.H"
38#include "../types.H"
39#include "../vec.H"
40#include "base_impl_intel64.H"
41#include "intrins_intel.H"
42
43#include <algorithm>
44#include <cstdint>
45#include <cstring>
46#include <type_traits>
47
48#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_64_AVAIL_) && \
49 !defined(SIMDVEC_SANDBOX)
50
51namespace simd {
52#define CLASS_MASK(TYPE, MASK_SIZE) \
53 template <> \
54 class Mask<TYPE, 64> \
55 { \
56 __mmask##MASK_SIZE k; \
57 \
58 public: \
59 Mask() \
60 { \
61 k = 0; \
62 } \
63 SIMD_INLINE Mask(const __mmask##MASK_SIZE &x) \
64 { \
65 k = x; \
66 } \
67 explicit SIMD_INLINE Mask(const Vec<TYPE, 64> &x) \
68 { \
69 k = msb2int(x); \
70 } \
71 Mask &operator=(const __mmask##MASK_SIZE &x) \
72 { \
73 k = x; \
74 return *this; \
75 } \
76 SIMD_INLINE operator __mmask##MASK_SIZE() const \
77 { \
78 return k; \
79 } \
80 explicit SIMD_INLINE operator Vec<TYPE, 64>() const \
81 { \
82 return int2bits<TYPE, 64>(k); \
83 } \
84 SIMD_INLINE bool operator[](const uint8_t i) const \
85 { \
86      return ((uint64_t(1) << i) & k) != 0; \
87 } \
88 SIMD_INLINE bool operator==(const Mask<TYPE, 64> &x) const \
89 { \
90 return k == x.k; \
91 } \
92 };
93
94#ifdef __AVX512BW__
95CLASS_MASK(Byte, 64)
96CLASS_MASK(SignedByte, 64)
97CLASS_MASK(Word, 32)
98CLASS_MASK(Short, 32)
99#endif
100CLASS_MASK(Int, 16)
101CLASS_MASK(Float, 16)
102CLASS_MASK(Long, 8)
103CLASS_MASK(Double, 8)
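// Illustrative usage sketch (comments only, not part of the library): a
// Mask<Int, 64> wraps a 16-bit __mmask16 with one bit per 32-bit lane.
//
//   Mask<Int, 64> m(__mmask16(0x0005)); // lanes 0 and 2 selected
//   bool b0 = m[0];                     // true
//   bool b1 = m[1];                     // false
//   // the explicit conversion expands each mask bit into a full lane
//   // via int2bits:
//   Vec<Int, 64> v = Vec<Int, 64>(m);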
104
105namespace internal {
106namespace mask {
107#define MASK_SOP(OP, TYPE, SUF) \
108 static SIMD_INLINE Vec<TYPE, 64> mask_##OP( \
109 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a) \
110 { \
111 return _mm512_mask_##OP##_##SUF(src, k, a); \
112 }
113
114#define MASKZ_SOP(OP, TYPE, SUF) \
115 static SIMD_INLINE Vec<TYPE, 64> maskz_##OP(const Mask<TYPE, 64> &k, \
116 const Vec<TYPE, 64> &a) \
117 { \
118 return _mm512_maskz_##OP##_##SUF(k, a); \
119 }
120
121// For operations with one argument. OP is the name of the operation (e.g.
122// abs, sqrt), TYPE is the typename (e.g. Word, Float), and SUF is the
123// suffix of the intrinsic (e.g. epi8, epi16, ps).
124#define GENERATE_SOP(OP, TYPE, SUF) \
125 MASK_SOP(OP, TYPE, SUF) MASKZ_SOP(OP, TYPE, SUF)
126
127#define MASK_DOP(OP, TYPE, SUF) \
128 static SIMD_INLINE Vec<TYPE, 64> mask_##OP( \
129 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, \
130 const Vec<TYPE, 64> &b) \
131 { \
132 return _mm512_mask_##OP##_##SUF(src, k, a, b); \
133 }
134
135#define MASKZ_DOP(OP, TYPE, SUF) \
136 static SIMD_INLINE Vec<TYPE, 64> maskz_##OP( \
137 const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
138 { \
139 return _mm512_maskz_##OP##_##SUF(k, a, b); \
140 }
141
142// For operations with two arguments. OP is the name of the operation (e.g. add,
143// sub, mul), TYPE is the typename (e.g. Word, Float), and SUF is the
144// suffix of the intrinsic (e.g. epi8, epi16, ps).
145#define GENERATE_DOP(OP, TYPE, SUF) \
146 MASK_DOP(OP, TYPE, SUF) MASKZ_DOP(OP, TYPE, SUF)
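// For illustration only, GENERATE_DOP(add, Float, ps) expands to a
// merge-masking and a zero-masking wrapper around the AVX-512 intrinsic:
//
//   static SIMD_INLINE Vec<Float, 64> mask_add(
//     const Vec<Float, 64> &src, const Mask<Float, 64> &k,
//     const Vec<Float, 64> &a, const Vec<Float, 64> &b)
//   {
//     return _mm512_mask_add_ps(src, k, a, b); // lanes with k bit 0 keep src
//   }
//   static SIMD_INLINE Vec<Float, 64> maskz_add(
//     const Mask<Float, 64> &k, const Vec<Float, 64> &a, const Vec<Float, 64> &b)
//   {
//     return _mm512_maskz_add_ps(k, a, b);     // lanes with k bit 0 become zero
//   }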
147
148// ---------------------------------------------------------------------------
149// mask_ifelse v
150// ---------------------------------------------------------------------------
151
152// 29. Mar 23 (Jonas Keller): added explicit cast to __m512(i) register to avoid
153// compiler errors (can't convert simd::Vec to __v64qi, etc...)
154
155#define MASK_IFELSE(TYPE, SUF, REG) \
156 static SIMD_INLINE Vec<TYPE, 64> mask_ifelse(const Mask<TYPE, 64> &cond, \
157 const Vec<TYPE, 64> &a, \
158 const Vec<TYPE, 64> &b) \
159 { \
160 return (REG) _mm512_mask_blend_##SUF(cond, (REG) b, (REG) a); \
161 }
162
163#ifdef __AVX512BW__
164MASK_IFELSE(Byte, epi8, __m512i)
165MASK_IFELSE(SignedByte, epi8, __m512i)
166MASK_IFELSE(Word, epi16, __m512i)
167MASK_IFELSE(Short, epi16, __m512i)
168#endif
169MASK_IFELSE(Int, epi32, __m512i)
170MASK_IFELSE(Float, ps, __m512)
171MASK_IFELSE(Long, epi64, __m512i)
172MASK_IFELSE(Double, pd, __m512d)
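// Element-wise semantics of mask_ifelse (for reference):
//   result[i] = cond[i] ? a[i] : b[i]
// _mm512_mask_blend_*(cond, b, a) implements exactly this, since the blend
// intrinsic selects its second vector operand where the mask bit is set and
// its first operand where it is clear.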
173
174// ---------------------------------------------------------------------------
175// mask_ifelsezero (mask_ifelsezero(cond, a) is the same as mask_ifelse(cond, a,
176// setzero()), but may have faster implementations)
177// ---------------------------------------------------------------------------
178
179#define MASK_IFELSEZERO(TYPE) \
180 static SIMD_INLINE Vec<TYPE, 64> mask_ifelsezero( \
181 const Mask<TYPE, 64> &cond, const Vec<TYPE, 64> &trueVal) \
182 { \
183 return mask_ifelse(cond, trueVal, ::simd::setzero<TYPE, 64>()); \
184 }
185
186#ifdef __AVX512BW__
187MASK_IFELSEZERO(Byte)
188MASK_IFELSEZERO(SignedByte)
189MASK_IFELSEZERO(Word)
190MASK_IFELSEZERO(Short)
191#endif
192MASK_IFELSEZERO(Int)
193MASK_IFELSEZERO(Float)
194MASK_IFELSEZERO(Long)
195MASK_IFELSEZERO(Double)
196
197// ---------------------------------------------------------------------------
198// reinterpret_mask v
199// ---------------------------------------------------------------------------
200
201// 06. Feb 23 (Jonas Keller): added reinterpret_mask
202
203template <typename Tout, typename Tin>
204static SIMD_INLINE Mask<Tout, 64> reinterpret_mask(const Mask<Tin, 64> &k)
205{
206 static_assert(sizeof(Tout) == sizeof(Tin), "");
207 return Mask<Tout, 64>(k.k);
208}
209
210// ---------------------------------------------------------------------------
211// masked convert (without changes in the number of elements) v
212// ---------------------------------------------------------------------------
213
214// conversion with saturation; we wanted to have a fast solution that
215// doesn't trigger the overflow which results in a negative two's
216// complement result ("invalid int32": 0x80000000); therefore we clamp
217// the positive values at the maximal positive float which is
218// convertible to int32 without overflow (0x7fffff80 = 2147483520);
219// negative values cannot overflow (they are clamped to invalid int
220// which is the most negative int32)
221SIMD_INLINE Vec<Int, 64> maskz_cvts(const Mask<Float, 64> &k,
222 const Vec<Float, 64> &a)
223{
224 __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
225 return _mm512_maskz_cvtps_epi32(k, _mm512_maskz_min_ps(k, clip, a));
226}
227
228SIMD_INLINE Vec<Int, 64> mask_cvts(const Vec<Int, 64> &src,
229 const Mask<Float, 64> &k,
230 const Vec<Float, 64> &a)
231{
232 __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
233 return _mm512_mask_cvtps_epi32(src, k, _mm512_maskz_min_ps(k, clip, a));
234}
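// Worked example (illustrative, full mask): 3.0e9f is larger than any int32,
// so an unclamped _mm512_cvtps_epi32 would return the "invalid int32"
// 0x80000000; clamping against 2147483520.0f first yields 2147483520 instead.
// -3.0e9f still converts to 0x80000000, i.e. the most negative int32, which is
// the intended saturation for negative overflow.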
235
236// saturation is not necessary in this case
237SIMD_INLINE Vec<Float, 64> maskz_cvts(const Mask<Int, 64> &k,
238 const Vec<Int, 64> &a)
239{
240 return _mm512_maskz_cvtepi32_ps(k, a);
241}
242
243// saturation is not necessary in this case
244SIMD_INLINE Vec<Float, 64> mask_cvts(const Vec<Float, 64> &src,
245 const Mask<Int, 64> &k,
246 const Vec<Int, 64> &a)
247{
248 return _mm512_mask_cvtepi32_ps(src, k, a);
249}
250
251// ---------------------------------------------------------------------------
252// mask_set1 v
253// ---------------------------------------------------------------------------
254
255#define GENERATE_SET1(TYPE, SUF) \
256 static SIMD_INLINE Vec<TYPE, 64> mask_set1( \
257 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE a) \
258 { \
259 return _mm512_mask_set1_##SUF(src, k, a); \
260 } \
261 static SIMD_INLINE Vec<TYPE, 64> maskz_set1(const Mask<TYPE, 64> &k, \
262 const TYPE a) \
263 { \
264 return _mm512_maskz_set1_##SUF(k, a); \
265 }
266
267#ifdef __AVX512BW__
268GENERATE_SET1(Byte, epi8)
269GENERATE_SET1(SignedByte, epi8)
270GENERATE_SET1(Word, epi16)
271GENERATE_SET1(Short, epi16)
272#endif
273GENERATE_SET1(Int, epi32)
274GENERATE_SET1(Long, epi64)
275// Workaround for Float, because there is no mask_set1_ps
276static SIMD_INLINE Vec<Float, 64> mask_set1(const Vec<Float, 64> &src,
277 const Mask<Float, 64> &k,
278 const Float a)
279{
280 return _mm512_castsi512_ps(
281 _mm512_mask_set1_epi32(_mm512_castps_si512(src), k, bit_cast<Int>(a)));
282}
283static SIMD_INLINE Vec<Float, 64> maskz_set1(const Mask<Float, 64> &k,
284 const Float a)
285{
286 return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(k, bit_cast<Int>(a)));
287}
288// Workaround for Double, because there is no mask_set1_pd
289static SIMD_INLINE Vec<Double, 64> mask_set1(const Vec<Double, 64> &src,
290 const Mask<Double, 64> &k,
291 const Double a)
292{
293 return _mm512_castsi512_pd(
294 _mm512_mask_set1_epi64(_mm512_castpd_si512(src), k, bit_cast<Long>(a)));
295}
296static SIMD_INLINE Vec<Double, 64> maskz_set1(const Mask<Double, 64> &k,
297 const Double a)
298{
299 return _mm512_castsi512_pd(_mm512_maskz_set1_epi64(k, bit_cast<Long>(a)));
300}
301
302// ---------------------------------------------------------------------------
303// mask_load v
304// ---------------------------------------------------------------------------
305
306#define GENERATE_LOAD(NAME, TYPE, SUF) \
307 static SIMD_INLINE Vec<TYPE, 64> mask_load( \
308 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE *const p) \
309 { \
310    /* AVX-512 load and store instructions need alignment to 64 bytes */ \
311    /* (lower 6 address bits need to be zero) */ \
312 SIMD_CHECK_ALIGNMENT(p, 64); \
313 return _mm512_mask_##NAME##_##SUF(src, k, p); \
314 } \
315 static SIMD_INLINE Vec<TYPE, 64> maskz_load(const Mask<TYPE, 64> &k, \
316 const TYPE *const p) \
317 { \
318    /* AVX-512 load and store instructions need alignment to 64 bytes */ \
319    /* (lower 6 address bits need to be zero) */ \
320 SIMD_CHECK_ALIGNMENT(p, 64); \
321 return _mm512_maskz_##NAME##_##SUF(k, p); \
322 }
323
324#ifdef __AVX512BW__
325// there is no aligned load for 8 and 16 bit types, so we use loadu
326GENERATE_LOAD(loadu, Byte, epi8)
327GENERATE_LOAD(loadu, SignedByte, epi8)
328GENERATE_LOAD(loadu, Word, epi16)
329GENERATE_LOAD(loadu, Short, epi16)
330#endif
331
332GENERATE_LOAD(load, Int, epi32)
333GENERATE_LOAD(load, Float, ps)
334GENERATE_LOAD(load, Long, epi64)
335GENERATE_LOAD(load, Double, pd)
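// Typical use of the masked loads (illustrative sketch, not part of the
// library): fetching the tail of an Int array whose length n is not a
// multiple of the 16 lanes, assuming data itself is 64-byte aligned:
//
//   const size_t done = n - (n % 16);               // elements already handled
//   Mask<Int, 64> tail(__mmask16((1u << (n % 16)) - 1u));
//   Vec<Int, 64> v = maskz_load(tail, data + done); // masked-off lanes read as 0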
336
337// ---------------------------------------------------------------------------
338// mask_loadu v
339// ---------------------------------------------------------------------------
340
341#define GENERATE_LOADU(TYPE, SUF) \
342 static SIMD_INLINE Vec<TYPE, 64> mask_loadu( \
343 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE *const p) \
344 { \
345 return _mm512_mask_loadu_##SUF(src, k, p); \
346 } \
347 static SIMD_INLINE Vec<TYPE, 64> maskz_loadu(const Mask<TYPE, 64> &k, \
348 const TYPE *const p) \
349 { \
350 return _mm512_maskz_loadu_##SUF(k, p); \
351 }
352
353#ifdef __AVX512BW__
354GENERATE_LOADU(Byte, epi8)
355GENERATE_LOADU(SignedByte, epi8)
356GENERATE_LOADU(Word, epi16)
357GENERATE_LOADU(Short, epi16)
358#endif
359
360GENERATE_LOADU(Int, epi32)
361GENERATE_LOADU(Float, ps)
362GENERATE_LOADU(Long, epi64)
363GENERATE_LOADU(Double, pd)
364
365// ---------------------------------------------------------------------------
366// mask_store v
367// ---------------------------------------------------------------------------
368
369// There are no *_maskz_store_* intrinsics, only *_mask_store_* intrinsics
370
371#define MASK_STORE(NAME, TYPE, SUF) \
372 static SIMD_INLINE void mask_store(TYPE *const p, const Mask<TYPE, 64> &k, \
373 const Vec<TYPE, 64> &a) \
374 { \
375    /* AVX-512 load and store instructions need alignment to 64 bytes */ \
376    /* (lower 6 address bits need to be zero) */ \
377 SIMD_CHECK_ALIGNMENT(p, 64); \
378 return _mm512_mask_##NAME##_##SUF(p, k, a); \
379 }
380
381#ifdef __AVX512BW__
382// there is no aligned store for 8 and 16 bit types, so we use storeu
383MASK_STORE(storeu, Byte, epi8)
384MASK_STORE(storeu, SignedByte, epi8)
385MASK_STORE(storeu, Word, epi16)
386MASK_STORE(storeu, Short, epi16)
387#endif
388
389MASK_STORE(store, Int, epi32)
390MASK_STORE(store, Float, ps)
391MASK_STORE(store, Long, epi64)
392MASK_STORE(store, Double, pd)
393
394// ---------------------------------------------------------------------------
395// mask_storeu v
396// ---------------------------------------------------------------------------
397
398// There are no *_maskz_storeu_* intrinsics, only *_mask_storeu_* intrinsics
399
400#define MASK_STOREU(TYPE, SUF) \
401 static SIMD_INLINE void mask_storeu(TYPE *const p, const Mask<TYPE, 64> &k, \
402 const Vec<TYPE, 64> &a) \
403 { \
404 return _mm512_mask_storeu_##SUF(p, k, a); \
405 }
406#ifdef __AVX512BW__
407MASK_STOREU(Byte, epi8)
408MASK_STOREU(SignedByte, epi8)
409MASK_STOREU(Word, epi16)
410MASK_STOREU(Short, epi16)
411#endif
412MASK_STOREU(Int, epi32)
413MASK_STOREU(Float, ps)
414MASK_STOREU(Long, epi64)
415MASK_STOREU(Double, pd)
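// Illustrative counterpart for stores (sketch, with n, done, tail and v as in
// the masked load sketch further above): only the lanes whose mask bit is set
// are written, the rest of the memory is left untouched:
//
//   mask_storeu(data + done, tail, v);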
416
417// ---------------------------------------------------------------------------
418// mask_add v
419// ---------------------------------------------------------------------------
420
421#ifdef __AVX512BW__
422GENERATE_DOP(add, Byte, epi8)
423GENERATE_DOP(add, SignedByte, epi8)
424GENERATE_DOP(add, Word, epi16)
425GENERATE_DOP(add, Short, epi16)
426#endif
427GENERATE_DOP(add, Int, epi32)
428GENERATE_DOP(add, Float, ps)
429GENERATE_DOP(add, Long, epi64)
430GENERATE_DOP(add, Double, pd)
431
432// ---------------------------------------------------------------------------
433// mask_adds v
434// ---------------------------------------------------------------------------
435
436#ifdef __AVX512BW__
437GENERATE_DOP(adds, Byte, epu8)
438GENERATE_DOP(adds, SignedByte, epi8)
439GENERATE_DOP(adds, Word, epu16)
440GENERATE_DOP(adds, Short, epi16)
441#endif
442
443// 09. Mar 23 (Jonas Keller): removed non saturating version of adds for Int and
444// Float, use the emulated versions in SIMDVecMaskImplEmu.H instead
445
446// ---------------------------------------------------------------------------
447// mask_sub v
448// ---------------------------------------------------------------------------
449
450#ifdef __AVX512BW__
451GENERATE_DOP(sub, Byte, epi8)
452GENERATE_DOP(sub, SignedByte, epi8)
453GENERATE_DOP(sub, Word, epi16)
454GENERATE_DOP(sub, Short, epi16)
455#endif
456GENERATE_DOP(sub, Int, epi32)
457GENERATE_DOP(sub, Float, ps)
458GENERATE_DOP(sub, Long, epi64)
459GENERATE_DOP(sub, Double, pd)
460
461// ---------------------------------------------------------------------------
462// mask_subs v
463// ---------------------------------------------------------------------------
464
465#ifdef __AVX512BW__
466GENERATE_DOP(subs, Byte, epu8)
467GENERATE_DOP(subs, SignedByte, epi8)
468GENERATE_DOP(subs, Word, epu16)
469GENERATE_DOP(subs, Short, epi16)
470#endif
471
472// 09. Mar 23 (Jonas Keller): removed non saturating version of subs for Int and
473// Float, use the emulated versions in SIMDVecMaskImplEmu.H instead
474
475// ---------------------------------------------------------------------------
476// mask_mul v
477// ---------------------------------------------------------------------------
478
479GENERATE_DOP(mul, Float, ps)
480GENERATE_DOP(mul, Double, pd)
481
482// ---------------------------------------------------------------------------
483// mask_div v
484// ---------------------------------------------------------------------------
485
486GENERATE_DOP(div, Float, ps)
487GENERATE_DOP(div, Double, pd)
488
489// ---------------------------------------------------------------------------
490// masked ceil, floor, round, truncate v
491// ---------------------------------------------------------------------------
492
493// 10. Apr 23 (Jonas Keller): added versions for integer types
494
495// versions for integer types do nothing:
496
497template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
498static SIMD_INLINE Vec<T, 64> mask_ceil(const Vec<T, 64> &src,
499 const Mask<T, 64> &k,
500 const Vec<T, 64> &a)
501{
502 return mask_ifelse(k, a, src);
503}
504
505template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
506static SIMD_INLINE Vec<T, 64> maskz_ceil(const Mask<T, 64> &k,
507 const Vec<T, 64> &a)
508{
509 return mask_ifelsezero(k, a);
510}
511
512template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
513static SIMD_INLINE Vec<T, 64> mask_floor(const Vec<T, 64> &src,
514 const Mask<T, 64> &k,
515 const Vec<T, 64> &a)
516{
517 return mask_ifelse(k, a, src);
518}
519
520template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
521static SIMD_INLINE Vec<T, 64> maskz_floor(const Mask<T, 64> &k,
522 const Vec<T, 64> &a)
523{
524 return mask_ifelsezero(k, a);
525}
526
527template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
528static SIMD_INLINE Vec<T, 64> mask_round(const Vec<T, 64> &src,
529 const Mask<T, 64> &k,
530 const Vec<T, 64> &a)
531{
532 return mask_ifelse(k, a, src);
533}
534
535template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
536static SIMD_INLINE Vec<T, 64> maskz_round(const Mask<T, 64> &k,
537 const Vec<T, 64> &a)
538{
539 return mask_ifelsezero(k, a);
540}
541
542template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
543static SIMD_INLINE Vec<T, 64> mask_truncate(const Vec<T, 64> &src,
544 const Mask<T, 64> &k,
545 const Vec<T, 64> &a)
546{
547 return mask_ifelse(k, a, src);
548}
549
550template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
551static SIMD_INLINE Vec<T, 64> maskz_truncate(const Mask<T, 64> &k,
552 const Vec<T, 64> &a)
553{
554 return mask_ifelsezero(k, a);
555}
556
557// Float versions:
558
559static SIMD_INLINE Vec<Float, 64> mask_ceil(const Vec<Float, 64> &src,
560 const Mask<Float, 64> &k,
561 const Vec<Float, 64> &a)
562{
563 return _mm512_mask_roundscale_ps(src, k, a,
564 _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
565}
566
567static SIMD_INLINE Vec<Float, 64> maskz_ceil(const Mask<Float, 64> &k,
568 const Vec<Float, 64> &a)
569{
570 return _mm512_maskz_roundscale_ps(k, a,
571 _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
572}
573
574static SIMD_INLINE Vec<Float, 64> mask_floor(const Vec<Float, 64> &src,
575 const Mask<Float, 64> &k,
576 const Vec<Float, 64> &a)
577{
578 return _mm512_mask_roundscale_ps(src, k, a,
579 _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
580}
581
582static SIMD_INLINE Vec<Float, 64> maskz_floor(const Mask<Float, 64> &k,
583 const Vec<Float, 64> &a)
584{
585 return _mm512_maskz_roundscale_ps(k, a,
586 _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
587}
588
589static SIMD_INLINE Vec<Float, 64> mask_round(const Vec<Float, 64> &src,
590 const Mask<Float, 64> &k,
591 const Vec<Float, 64> &a)
592{
593 return _mm512_mask_roundscale_ps(
594 src, k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
595}
596
597static SIMD_INLINE Vec<Float, 64> maskz_round(const Mask<Float, 64> &k,
598 const Vec<Float, 64> &a)
599{
600 return _mm512_maskz_roundscale_ps(
601 k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
602}
603
604static SIMD_INLINE Vec<Float, 64> mask_truncate(const Vec<Float, 64> &src,
605 const Mask<Float, 64> &k,
606 const Vec<Float, 64> &a)
607{
608 return _mm512_mask_roundscale_ps(src, k, a,
609 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
610}
611
612static SIMD_INLINE Vec<Float, 64> maskz_truncate(const Mask<Float, 64> &k,
613 const Vec<Float, 64> &a)
614{
615 return _mm512_maskz_roundscale_ps(k, a,
616 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
617}
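// Worked examples for the Float versions above (illustrative, full mask):
//   ceil(1.3f)      ->  2.0f      floor(1.3f) ->  1.0f
//   truncate(-1.7f) -> -1.0f      round(2.5f) ->  2.0f
// _MM_FROUND_TO_NEAREST_INT rounds halfway cases to even, so round(2.5f)
// gives 2.0f while round(3.5f) gives 4.0f.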
618
619// Double versions:
620
621static SIMD_INLINE Vec<Double, 64> mask_ceil(const Vec<Double, 64> &src,
622 const Mask<Double, 64> &k,
623 const Vec<Double, 64> &a)
624{
625 return _mm512_mask_roundscale_pd(src, k, a,
626 _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
627}
628
629static SIMD_INLINE Vec<Double, 64> maskz_ceil(const Mask<Double, 64> &k,
630 const Vec<Double, 64> &a)
631{
632 return _mm512_maskz_roundscale_pd(k, a,
633 _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
634}
635
636static SIMD_INLINE Vec<Double, 64> mask_floor(const Vec<Double, 64> &src,
637 const Mask<Double, 64> &k,
638 const Vec<Double, 64> &a)
639{
640 return _mm512_mask_roundscale_pd(src, k, a,
641 _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
642}
643
644static SIMD_INLINE Vec<Double, 64> maskz_floor(const Mask<Double, 64> &k,
645 const Vec<Double, 64> &a)
646{
647 return _mm512_maskz_roundscale_pd(k, a,
648 _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
649}
650
651static SIMD_INLINE Vec<Double, 64> mask_round(const Vec<Double, 64> &src,
652 const Mask<Double, 64> &k,
653 const Vec<Double, 64> &a)
654{
655 return _mm512_mask_roundscale_pd(
656 src, k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
657}
658
659static SIMD_INLINE Vec<Double, 64> maskz_round(const Mask<Double, 64> &k,
660 const Vec<Double, 64> &a)
661{
662 return _mm512_maskz_roundscale_pd(
663 k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
664}
665
666static SIMD_INLINE Vec<Double, 64> mask_truncate(const Vec<Double, 64> &src,
667 const Mask<Double, 64> &k,
668 const Vec<Double, 64> &a)
669{
670 return _mm512_mask_roundscale_pd(src, k, a,
671 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
672}
673
674static SIMD_INLINE Vec<Double, 64> maskz_truncate(const Mask<Double, 64> &k,
675 const Vec<Double, 64> &a)
676{
677 return _mm512_maskz_roundscale_pd(k, a,
678 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
679}
680
681// ---------------------------------------------------------------------------
682// masked elementary mathematical functions v
683// ---------------------------------------------------------------------------
684
685// masked estimate of a reciprocal
686// NOTE: this has better precision than SSE and AVX versions!
687static SIMD_INLINE Vec<Float, 64> mask_rcp(const Vec<Float, 64> &src,
688 const Mask<Float, 64> &k,
689 const Vec<Float, 64> &a)
690{
691 return _mm512_mask_rcp14_ps(src, k, a);
692}
693
694static SIMD_INLINE Vec<Float, 64> maskz_rcp(const Mask<Float, 64> &k,
695 const Vec<Float, 64> &a)
696{
697 return _mm512_maskz_rcp14_ps(k, a);
698}
699
700static SIMD_INLINE Vec<Double, 64> mask_rcp(const Vec<Double, 64> &src,
701 const Mask<Double, 64> &k,
702 const Vec<Double, 64> &a)
703{
704 return _mm512_mask_rcp14_pd(src, k, a);
705}
706
707static SIMD_INLINE Vec<Double, 64> maskz_rcp(const Mask<Double, 64> &k,
708 const Vec<Double, 64> &a)
709{
710 return _mm512_maskz_rcp14_pd(k, a);
711}
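// The rcp14 estimate has a relative error of at most 2^-14. Where full
// precision is needed, one Newton-Raphson step can refine it; a sketch under
// that assumption (not part of the library, unmasked for brevity):
//
//   // x1 = x0 * (2 - a * x0) roughly doubles the number of correct bits
//   static SIMD_INLINE __m512 rcp_refined(const __m512 a)
//   {
//     const __m512 x0 = _mm512_rcp14_ps(a);
//     return _mm512_mul_ps(x0, _mm512_fnmadd_ps(a, x0, _mm512_set1_ps(2.0f)));
//   }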
712
713// masked estimate of the reciprocal square root
714// NOTE: this has better precision than SSE and AVX versions!
715static SIMD_INLINE Vec<Float, 64> mask_rsqrt(const Vec<Float, 64> &src,
716 const Mask<Float, 64> &k,
717 const Vec<Float, 64> &a)
718{
719 return _mm512_mask_rsqrt14_ps(src, k, a);
720}
721
722static SIMD_INLINE Vec<Float, 64> maskz_rsqrt(const Mask<Float, 64> &k,
723 const Vec<Float, 64> &a)
724{
725 return _mm512_maskz_rsqrt14_ps(k, a);
726}
727
728static SIMD_INLINE Vec<Double, 64> mask_rsqrt(const Vec<Double, 64> &src,
729 const Mask<Double, 64> &k,
730 const Vec<Double, 64> &a)
731{
732 return _mm512_mask_rsqrt14_pd(src, k, a);
733}
734
735static SIMD_INLINE Vec<Double, 64> maskz_rsqrt(const Mask<Double, 64> &k,
736 const Vec<Double, 64> &a)
737{
738 return _mm512_maskz_rsqrt14_pd(k, a);
739}
740
741// masked square root
742GENERATE_SOP(sqrt, Float, ps)
743GENERATE_SOP(sqrt, Double, pd)
744
745// ---------------------------------------------------------------------------
746// masked_abs v
747// ---------------------------------------------------------------------------
748
749// 25. Mar 25 (Jonas Keller): added masked abs for unsigned integers
750
751// unsigned integers: do nothing
752template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
753 &&std::is_integral<T>::value)>
754static SIMD_INLINE Vec<T, 64> mask_abs(const Vec<T, 64> &src,
755 const Mask<T, 64> &k,
756 const Vec<T, 64> &a)
757{
758 return mask_ifelse(k, a, src);
759}
760
761template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
762 &&std::is_integral<T>::value)>
763static SIMD_INLINE Vec<T, 64> maskz_abs(const Mask<T, 64> &k,
764 const Vec<T, 64> &a)
765{
766 return mask_ifelsezero(k, a);
767}
768
769#ifdef __AVX512BW__
770GENERATE_SOP(abs, SignedByte, epi8)
771GENERATE_SOP(abs, Short, epi16)
772#endif
773GENERATE_SOP(abs, Int, epi32)
774GENERATE_SOP(abs, Long, epi64)
775
776MASK_SOP(abs, Float, ps)
777
778// There is no _mm512_maskz_abs_ps
779static SIMD_INLINE Vec<Float, 64> maskz_abs(const Mask<Float, 64> &k,
780 const Vec<Float, 64> &a)
781{
782 return _mm512_mask_abs_ps(::simd::setzero<Float, 64>(), k, a);
783}
784
785MASK_SOP(abs, Double, pd)
786
787// There is no _mm512_maskz_abs_pd
788static SIMD_INLINE Vec<Double, 64> maskz_abs(const Mask<Double, 64> &k,
789 const Vec<Double, 64> &a)
790{
791 return _mm512_mask_abs_pd(::simd::setzero<Double, 64>(), k, a);
792}
793
794// ---------------------------------------------------------------------------
795// mask_and v
796// ---------------------------------------------------------------------------
797
798// there is no _mm512_mask_and_epi8 or _mm512_mask_and_epi16
799GENERATE_DOP(and, Int, epi32)
800GENERATE_DOP(and, Long, epi64)
801#ifdef __AVX512DQ__
802GENERATE_DOP(and, Float, ps)
803GENERATE_DOP(and, Double, pd)
804#else
805// Workaround with the epi32/64-versions and casts
806static SIMD_INLINE Vec<Float, 64> mask_and(const Vec<Float, 64> &src,
807 const Mask<Float, 64> &k,
808 const Vec<Float, 64> &a,
809 const Vec<Float, 64> &b)
810{
811 return _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_castps_si512(src), k,
812 _mm512_castps_si512(a),
813 _mm512_castps_si512(b)));
814}
815
816static SIMD_INLINE Vec<Float, 64> maskz_and(const Mask<Float, 64> &k,
817 const Vec<Float, 64> &a,
818 const Vec<Float, 64> &b)
819{
820 return _mm512_castsi512_ps(
821 _mm512_maskz_and_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
822}
823
824static SIMD_INLINE Vec<Double, 64> mask_and(const Vec<Double, 64> &src,
825 const Mask<Double, 64> &k,
826 const Vec<Double, 64> &a,
827 const Vec<Double, 64> &b)
828{
829 return _mm512_castsi512_pd(_mm512_mask_and_epi64(_mm512_castpd_si512(src), k,
830 _mm512_castpd_si512(a),
831 _mm512_castpd_si512(b)));
832}
833
834static SIMD_INLINE Vec<Double, 64> maskz_and(const Mask<Double, 64> &k,
835 const Vec<Double, 64> &a,
836 const Vec<Double, 64> &b)
837{
838 return _mm512_castsi512_pd(
839 _mm512_maskz_and_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
840}
841#endif
842
843// ---------------------------------------------------------------------------
844// mask_or v
845// ---------------------------------------------------------------------------
846
847// there is no _mm512_mask_or_epi8 or _mm512_mask_or_epi16
848GENERATE_DOP(or, Int, epi32)
849GENERATE_DOP(or, Long, epi64)
850#ifdef __AVX512DQ__
851GENERATE_DOP(or, Float, ps)
852GENERATE_DOP(or, Double, pd)
853#else
854// Workaround with the epi32/64-versions and casts
855static SIMD_INLINE Vec<Float, 64> mask_or(const Vec<Float, 64> &src,
856 const Mask<Float, 64> &k,
857 const Vec<Float, 64> &a,
858 const Vec<Float, 64> &b)
859{
860 return _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_castps_si512(src), k,
861 _mm512_castps_si512(a),
862 _mm512_castps_si512(b)));
863}
864
865static SIMD_INLINE Vec<Float, 64> maskz_or(const Mask<Float, 64> &k,
866 const Vec<Float, 64> &a,
867 const Vec<Float, 64> &b)
868{
869 return _mm512_castsi512_ps(
870 _mm512_maskz_or_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
871}
872
873static SIMD_INLINE Vec<Double, 64> mask_or(const Vec<Double, 64> &src,
874 const Mask<Double, 64> &k,
875 const Vec<Double, 64> &a,
876 const Vec<Double, 64> &b)
877{
878 return _mm512_castsi512_pd(_mm512_mask_or_epi64(_mm512_castpd_si512(src), k,
879 _mm512_castpd_si512(a),
880 _mm512_castpd_si512(b)));
881}
882
883static SIMD_INLINE Vec<Double, 64> maskz_or(const Mask<Double, 64> &k,
884 const Vec<Double, 64> &a,
885 const Vec<Double, 64> &b)
886{
887 return _mm512_castsi512_pd(
888 _mm512_maskz_or_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
889}
890#endif
891
892// ---------------------------------------------------------------------------
893// mask_andnot v
894// ---------------------------------------------------------------------------
895
896// there is no _mm512_mask_andnot_epi8 or _mm512_mask_andnot_epi16
897GENERATE_DOP(andnot, Int, epi32)
898GENERATE_DOP(andnot, Long, epi64)
899#ifdef __AVX512DQ__
900GENERATE_DOP(andnot, Float, ps)
901GENERATE_DOP(andnot, Double, pd)
902#else
903// Workaround with the epi32/64-versions and casts
904static SIMD_INLINE Vec<Float, 64> mask_andnot(const Vec<Float, 64> &src,
905 const Mask<Float, 64> &k,
906 const Vec<Float, 64> &a,
907 const Vec<Float, 64> &b)
908{
909 return _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_castps_si512(src),
910 k, _mm512_castps_si512(a),
911 _mm512_castps_si512(b)));
912}
913
914static SIMD_INLINE Vec<Float, 64> maskz_andnot(const Mask<Float, 64> &k,
915 const Vec<Float, 64> &a,
916 const Vec<Float, 64> &b)
917{
918 return _mm512_castsi512_ps(_mm512_maskz_andnot_epi32(
919 k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
920}
921
922static SIMD_INLINE Vec<Double, 64> mask_andnot(const Vec<Double, 64> &src,
923 const Mask<Double, 64> &k,
924 const Vec<Double, 64> &a,
925 const Vec<Double, 64> &b)
926{
927 return _mm512_castsi512_pd(_mm512_mask_andnot_epi64(_mm512_castpd_si512(src),
928 k, _mm512_castpd_si512(a),
929 _mm512_castpd_si512(b)));
930}
931
932static SIMD_INLINE Vec<Double, 64> maskz_andnot(const Mask<Double, 64> &k,
933 const Vec<Double, 64> &a,
934 const Vec<Double, 64> &b)
935{
936 return _mm512_castsi512_pd(_mm512_maskz_andnot_epi64(
937 k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
938}
939#endif
940
941// ---------------------------------------------------------------------------
942// mask_xor v
943// ---------------------------------------------------------------------------
944
945// there is no _mm512_mask_xor_epi8 or _mm512_mask_xor_epi16
946GENERATE_DOP(xor, Int, epi32)
947GENERATE_DOP(xor, Long, epi64)
948#ifdef __AVX512DQ__
949GENERATE_DOP(xor, Float, ps)
950GENERATE_DOP(xor, Double, pd)
951#else
952// Workaround with the epi32/64-versions and casts
953static SIMD_INLINE Vec<Float, 64> mask_xor(const Vec<Float, 64> &src,
954 const Mask<Float, 64> &k,
955 const Vec<Float, 64> &a,
956 const Vec<Float, 64> &b)
957{
958 return _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(src), k,
959 _mm512_castps_si512(a),
960 _mm512_castps_si512(b)));
961}
962
963static SIMD_INLINE Vec<Float, 64> maskz_xor(const Mask<Float, 64> &k,
964 const Vec<Float, 64> &a,
965 const Vec<Float, 64> &b)
966{
967 return _mm512_castsi512_ps(
968 _mm512_maskz_xor_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
969}
970
971static SIMD_INLINE Vec<Double, 64> mask_xor(const Vec<Double, 64> &src,
972 const Mask<Double, 64> &k,
973 const Vec<Double, 64> &a,
974 const Vec<Double, 64> &b)
975{
976 return _mm512_castsi512_pd(_mm512_mask_xor_epi64(_mm512_castpd_si512(src), k,
977 _mm512_castpd_si512(a),
978 _mm512_castpd_si512(b)));
979}
980
981static SIMD_INLINE Vec<Double, 64> maskz_xor(const Mask<Double, 64> &k,
982 const Vec<Double, 64> &a,
983 const Vec<Double, 64> &b)
984{
985 return _mm512_castsi512_pd(
986 _mm512_maskz_xor_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
987}
988#endif
989
990// ---------------------------------------------------------------------------
991// mask_not v
992// ---------------------------------------------------------------------------
993
994// 08. Apr 23 (Jonas Keller): added mask_not and maskz_not
995
996// There is no masked "not"-intrinsic, so use the masked xor with all ones
997
998// there are no masked xor intrinsics for epi8 and epi16
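// The identity used here: x ^ ~0 == ~x, e.g. for one Int lane
// 0x0F0F0F0F ^ 0xFFFFFFFF == 0xF0F0F0F0, so a masked xor against all-ones
// acts as a masked bitwise NOT.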
999
1000// Int
1001static SIMD_INLINE Vec<Int, 64> mask_not(const Vec<Int, 64> &src,
1002 const Mask<Int, 64> &k,
1003 const Vec<Int, 64> &a)
1004{
1005 return _mm512_mask_xor_epi32(src, k, a, _mm512_set1_epi32(-1));
1006}
1007static SIMD_INLINE Vec<Int, 64> maskz_not(const Mask<Int, 64> &k,
1008 const Vec<Int, 64> &a)
1009{
1010 return _mm512_maskz_xor_epi32(k, a, _mm512_set1_epi32(-1));
1011}
1012
1013// Long
1014static SIMD_INLINE Vec<Long, 64> mask_not(const Vec<Long, 64> &src,
1015 const Mask<Long, 64> &k,
1016 const Vec<Long, 64> &a)
1017{
1018 return _mm512_mask_xor_epi64(src, k, a, _mm512_set1_epi64(-1));
1019}
1020static SIMD_INLINE Vec<Long, 64> maskz_not(const Mask<Long, 64> &k,
1021 const Vec<Long, 64> &a)
1022{
1023 return _mm512_maskz_xor_epi64(k, a, _mm512_set1_epi64(-1));
1024}
1025#ifdef __AVX512DQ__
1026// Float
1027static SIMD_INLINE Vec<Float, 64> mask_not(const Vec<Float, 64> &src,
1028 const Mask<Float, 64> &k,
1029 const Vec<Float, 64> &a)
1030{
1031 return _mm512_mask_xor_ps(src, k, a,
1032 _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
1033}
1034static SIMD_INLINE Vec<Float, 64> maskz_not(const Mask<Float, 64> &k,
1035 const Vec<Float, 64> &a)
1036{
1037 return _mm512_maskz_xor_ps(k, a, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
1038}
1039
1040// Double
1041static SIMD_INLINE Vec<Double, 64> mask_not(const Vec<Double, 64> &src,
1042 const Mask<Double, 64> &k,
1043 const Vec<Double, 64> &a)
1044{
1045 return _mm512_mask_xor_pd(src, k, a,
1046 _mm512_castsi512_pd(_mm512_set1_epi64(-1)));
1047}
1048static SIMD_INLINE Vec<Double, 64> maskz_not(const Mask<Double, 64> &k,
1049 const Vec<Double, 64> &a)
1050{
1051 return _mm512_maskz_xor_pd(k, a, _mm512_castsi512_pd(_mm512_set1_epi64(-1)));
1052}
1053#else
1054// Workaround with the epi32/64-versions and casts
1055static SIMD_INLINE Vec<Float, 64> mask_not(const Vec<Float, 64> &src,
1056 const Mask<Float, 64> &k,
1057 const Vec<Float, 64> &a)
1058{
1059 return _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(src), k,
1060 _mm512_castps_si512(a),
1061 _mm512_set1_epi32(-1)));
1062}
1063
1064static SIMD_INLINE Vec<Float, 64> maskz_not(const Mask<Float, 64> &k,
1065 const Vec<Float, 64> &a)
1066{
1067 return _mm512_castsi512_ps(
1068 _mm512_maskz_xor_epi32(k, _mm512_castps_si512(a), _mm512_set1_epi32(-1)));
1069}
1070
1071static SIMD_INLINE Vec<Double, 64> mask_not(const Vec<Double, 64> &src,
1072 const Mask<Double, 64> &k,
1073 const Vec<Double, 64> &a)
1074{
1075 return _mm512_castsi512_pd(_mm512_mask_xor_epi64(_mm512_castpd_si512(src), k,
1076 _mm512_castpd_si512(a),
1077 _mm512_set1_epi64(-1)));
1078}
1079
1080static SIMD_INLINE Vec<Double, 64> maskz_not(const Mask<Double, 64> &k,
1081 const Vec<Double, 64> &a)
1082{
1083 return _mm512_castsi512_pd(
1084 _mm512_maskz_xor_epi64(k, _mm512_castpd_si512(a), _mm512_set1_epi64(-1)));
1085}
1086#endif
1087
1088// ---------------------------------------------------------------------------
1089// mask_neg (negate = two's complement or unary minus), only signed types v
1090// ---------------------------------------------------------------------------
1091
1092#define GENERATE_NEG(TYPE, SUF) \
1093 static SIMD_INLINE Vec<TYPE, 64> mask_neg( \
1094 const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a) \
1095 { \
1096 return _mm512_mask_sub_##SUF(src, k, setzero<TYPE, 64>(), a); \
1097 } \
1098 static SIMD_INLINE Vec<TYPE, 64> maskz_neg(const Mask<TYPE, 64> &k, \
1099 const Vec<TYPE, 64> &a) \
1100 { \
1101 return _mm512_maskz_sub_##SUF(k, setzero<TYPE, 64>(), a); \
1102 }
1103
1104#ifdef __AVX512BW__
1105GENERATE_NEG(SignedByte, epi8)
1106GENERATE_NEG(Short, epi16)
1107#endif
1108GENERATE_NEG(Int, epi32)
1109GENERATE_NEG(Float, ps)
1110GENERATE_NEG(Long, epi64)
1111GENERATE_NEG(Double, pd)
1112
1113// ---------------------------------------------------------------------------
1114// mask_min v
1115// ---------------------------------------------------------------------------
1116
1117#ifdef __AVX512BW__
1118GENERATE_DOP(min, Byte, epu8)
1119GENERATE_DOP(min, SignedByte, epi8)
1120GENERATE_DOP(min, Word, epu16)
1121GENERATE_DOP(min, Short, epi16)
1122#endif
1123GENERATE_DOP(min, Int, epi32)
1124GENERATE_DOP(min, Float, ps)
1125GENERATE_DOP(min, Long, epi64)
1126GENERATE_DOP(min, Double, pd)
1127
1128// ---------------------------------------------------------------------------
1129// mask_max v
1130// ---------------------------------------------------------------------------
1131
1132#ifdef __AVX512BW__
1133GENERATE_DOP(max, Byte, epu8)
1134GENERATE_DOP(max, SignedByte, epi8)
1135GENERATE_DOP(max, Word, epu16)
1136GENERATE_DOP(max, Short, epi16)
1137#endif
1138GENERATE_DOP(max, Int, epi32)
1139GENERATE_DOP(max, Float, ps)
1140GENERATE_DOP(max, Long, epi64)
1141GENERATE_DOP(max, Double, pd)
1142
1143// ---------------------------------------------------------------------------
1144// masked srai (16,32,64 only) v
1145// ---------------------------------------------------------------------------
1146
1147#ifdef __AVX512BW__
1148template <size_t COUNT>
1149static SIMD_INLINE Vec<Word, 64> mask_srai(const Vec<Word, 64> &src,
1150 const Mask<Word, 64> &k,
1151 const Vec<Word, 64> &a)
1152{
1153 return _mm512_mask_srai_epi16(src, k, a, vec::min(COUNT, 15ul));
1154}
1155
1156template <size_t COUNT>
1157static SIMD_INLINE Vec<Word, 64> maskz_srai(const Mask<Word, 64> &k,
1158 const Vec<Word, 64> &a)
1159{
1160 return _mm512_maskz_srai_epi16(k, a, vec::min(COUNT, 15ul));
1161}
1162
1163template <size_t COUNT>
1164static SIMD_INLINE Vec<Short, 64> mask_srai(const Vec<Short, 64> &src,
1165 const Mask<Short, 64> &k,
1166 const Vec<Short, 64> &a)
1167{
1168 return _mm512_mask_srai_epi16(src, k, a, vec::min(COUNT, 15ul));
1169}
1170
1171template <size_t COUNT>
1172static SIMD_INLINE Vec<Short, 64> maskz_srai(const Mask<Short, 64> &k,
1173 const Vec<Short, 64> &a)
1174{
1175 return _mm512_maskz_srai_epi16(k, a, vec::min(COUNT, 15ul));
1176}
1177
1178#endif
1179
1180template <size_t COUNT>
1181static SIMD_INLINE Vec<Int, 64> mask_srai(const Vec<Int, 64> &src,
1182 const Mask<Int, 64> &k,
1183 const Vec<Int, 64> &a)
1184{
1185 return _mm512_mask_srai_epi32(src, k, a, vec::min(COUNT, 31ul));
1186}
1187
1188template <size_t COUNT>
1189static SIMD_INLINE Vec<Int, 64> maskz_srai(const Mask<Int, 64> &k,
1190 const Vec<Int, 64> &a)
1191{
1192 return _mm512_maskz_srai_epi32(k, a, vec::min(COUNT, 31ul));
1193}
1194
1195template <size_t COUNT>
1196static SIMD_INLINE Vec<Long, 64> mask_srai(const Vec<Long, 64> &src,
1197 const Mask<Long, 64> &k,
1198 const Vec<Long, 64> &a)
1199{
1200 return _mm512_mask_srai_epi64(src, k, a, vec::min(COUNT, 63ul));
1201}
1202
1203template <size_t COUNT>
1204static SIMD_INLINE Vec<Long, 64> maskz_srai(const Mask<Long, 64> &k,
1205 const Vec<Long, 64> &a)
1206{
1207 return _mm512_maskz_srai_epi64(k, a, vec::min(COUNT, 63ul));
1208}
1209
1210// ---------------------------------------------------------------------------
1211// masked srli v
1212// ---------------------------------------------------------------------------
1213
1214#ifdef __AVX512BW__
1215template <size_t COUNT>
1216static SIMD_INLINE Vec<Word, 64> mask_srli(const Vec<Word, 64> &src,
1217 const Mask<Word, 64> &k,
1218 const Vec<Word, 64> &a)
1219{
1220 SIMD_IF_CONSTEXPR (COUNT < 16) {
1221 return _mm512_mask_srli_epi16(src, k, a, COUNT);
1222 } else {
1223 return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
1224 }
1225}
1226
1227template <size_t COUNT>
1228static SIMD_INLINE Vec<Word, 64> maskz_srli(const Mask<Word, 64> &k,
1229 const Vec<Word, 64> &a)
1230{
1231 SIMD_IF_CONSTEXPR (COUNT < 16) {
1232 return _mm512_maskz_srli_epi16(k, a, COUNT);
1233 } else {
1234 return _mm512_setzero_si512();
1235 }
1236}
1237
1238template <size_t COUNT>
1239static SIMD_INLINE Vec<Short, 64> mask_srli(const Vec<Short, 64> &src,
1240 const Mask<Short, 64> &k,
1241 const Vec<Short, 64> &a)
1242{
1243 SIMD_IF_CONSTEXPR (COUNT < 16) {
1244 return _mm512_mask_srli_epi16(src, k, a, COUNT);
1245 } else {
1246 return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
1247 }
1248}
1249
1250template <size_t COUNT>
1251static SIMD_INLINE Vec<Short, 64> maskz_srli(const Mask<Short, 64> &k,
1252 const Vec<Short, 64> &a)
1253{
1254 SIMD_IF_CONSTEXPR (COUNT < 16) {
1255 return _mm512_maskz_srli_epi16(k, a, COUNT);
1256 } else {
1257 return _mm512_setzero_si512();
1258 }
1259}
1260
1261#endif
1262
1263template <size_t COUNT>
1264static SIMD_INLINE Vec<Int, 64> mask_srli(const Vec<Int, 64> &src,
1265 const Mask<Int, 64> &k,
1266 const Vec<Int, 64> &a)
1267{
1268 SIMD_IF_CONSTEXPR (COUNT < 32) {
1269 return _mm512_mask_srli_epi32(src, k, a, COUNT);
1270 } else {
1271 return _mm512_mask_blend_epi32(k, src, _mm512_setzero_si512());
1272 }
1273}
1274
1275template <size_t COUNT>
1276static SIMD_INLINE Vec<Int, 64> maskz_srli(const Mask<Int, 64> &k,
1277 const Vec<Int, 64> &a)
1278{
1279 SIMD_IF_CONSTEXPR (COUNT < 32) {
1280 return _mm512_maskz_srli_epi32(k, a, COUNT);
1281 } else {
1282 return _mm512_setzero_si512();
1283 }
1284}
1285
1286template <size_t COUNT>
1287static SIMD_INLINE Vec<Long, 64> mask_srli(const Vec<Long, 64> &src,
1288 const Mask<Long, 64> &k,
1289 const Vec<Long, 64> &a)
1290{
1291 SIMD_IF_CONSTEXPR (COUNT < 64) {
1292 return _mm512_mask_srli_epi64(src, k, a, COUNT);
1293 } else {
1294 return _mm512_mask_blend_epi64(k, src, _mm512_setzero_si512());
1295 }
1296}
1297
1298template <size_t COUNT>
1299static SIMD_INLINE Vec<Long, 64> maskz_srli(const Mask<Long, 64> &k,
1300 const Vec<Long, 64> &a)
1301{
1302 SIMD_IF_CONSTEXPR (COUNT < 64) {
1303 return _mm512_maskz_srli_epi64(k, a, COUNT);
1304 } else {
1305 return _mm512_setzero_si512();
1306 }
1307}
1308
1309// ---------------------------------------------------------------------------
1310// masked slli v
1311// ---------------------------------------------------------------------------
1312
1313#ifdef __AVX512BW__
1314template <size_t COUNT>
1315static SIMD_INLINE Vec<Word, 64> mask_slli(const Vec<Word, 64> &src,
1316 const Mask<Word, 64> &k,
1317 const Vec<Word, 64> &a)
1318{
1319 SIMD_IF_CONSTEXPR (COUNT < 16) {
1320 return _mm512_mask_slli_epi16(src, k, a, COUNT);
1321 } else {
1322 return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
1323 }
1324}
1325
1326template <size_t COUNT>
1327static SIMD_INLINE Vec<Word, 64> maskz_slli(const Mask<Word, 64> &k,
1328 const Vec<Word, 64> &a)
1329{
1330 SIMD_IF_CONSTEXPR (COUNT < 16) {
1331 return _mm512_maskz_slli_epi16(k, a, COUNT);
1332 } else {
1333 return _mm512_setzero_si512();
1334 }
1335}
1336
1337template <size_t COUNT>
1338static SIMD_INLINE Vec<Short, 64> mask_slli(const Vec<Short, 64> &src,
1339 const Mask<Short, 64> &k,
1340 const Vec<Short, 64> &a)
1341{
1342 SIMD_IF_CONSTEXPR (COUNT < 16) {
1343 return _mm512_mask_slli_epi16(src, k, a, COUNT);
1344 } else {
1345 return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
1346 }
1347}
1348
1349template <size_t COUNT>
1350static SIMD_INLINE Vec<Short, 64> maskz_slli(const Mask<Short, 64> &k,
1351 const Vec<Short, 64> &a)
1352{
1353 SIMD_IF_CONSTEXPR (COUNT < 16) {
1354 return _mm512_maskz_slli_epi16(k, a, COUNT);
1355 } else {
1356 return _mm512_setzero_si512();
1357 }
1358}
1359
1360#endif
1361
1362template <size_t COUNT>
1363static SIMD_INLINE Vec<Int, 64> mask_slli(const Vec<Int, 64> &src,
1364 const Mask<Int, 64> &k,
1365 const Vec<Int, 64> &a)
1366{
1367 SIMD_IF_CONSTEXPR (COUNT < 32) {
1368 return _mm512_mask_slli_epi32(src, k, a, COUNT);
1369 } else {
1370 return _mm512_mask_blend_epi32(k, src, _mm512_setzero_si512());
1371 }
1372}
1373
1374template <size_t COUNT>
1375static SIMD_INLINE Vec<Int, 64> maskz_slli(const Mask<Int, 64> &k,
1376 const Vec<Int, 64> &a)
1377{
1378 SIMD_IF_CONSTEXPR (COUNT < 32) {
1379 return _mm512_maskz_slli_epi32(k, a, COUNT);
1380 } else {
1381 return _mm512_setzero_si512();
1382 }
1383}
1384
1385template <size_t COUNT>
1386static SIMD_INLINE Vec<Long, 64> mask_slli(const Vec<Long, 64> &src,
1387 const Mask<Long, 64> &k,
1388 const Vec<Long, 64> &a)
1389{
1390 SIMD_IF_CONSTEXPR (COUNT < 64) {
1391 return _mm512_mask_slli_epi64(src, k, a, COUNT);
1392 } else {
1393 return _mm512_mask_blend_epi64(k, src, _mm512_setzero_si512());
1394 }
1395}
1396
1397template <size_t COUNT>
1398static SIMD_INLINE Vec<Long, 64> maskz_slli(const Mask<Long, 64> &k,
1399 const Vec<Long, 64> &a)
1400{
1401 SIMD_IF_CONSTEXPR (COUNT < 64) {
1402 return _mm512_maskz_slli_epi64(k, a, COUNT);
1403 } else {
1404 return _mm512_setzero_si512();
1405 }
1406}
1407
1408// 05. Aug 22 (Jonas Keller):
1409// Improved implementation of masked hadd, hadds, hsub and hsubs,
1410// implementation uses masked add/adds/sub/subs directly now instead of
1411// wrapping hadd, hadds, hsub and hsubs with a mask_ifelse(zero).
1412// Byte and SignedByte are now supported as well.
1413
1414// ---------------------------------------------------------------------------
1415// masked hadd v
1416// ---------------------------------------------------------------------------
1417
1418template <typename T>
1419static SIMD_INLINE Vec<T, 64> mask_hadd(const Vec<T, 64> &src,
1420 const Mask<T, 64> &k,
1421 const Vec<T, 64> &a,
1422 const Vec<T, 64> &b)
1423{
1424 Vec<T, 64> x, y;
1425 unzip<1>(a, b, x, y);
1426 return internal::mask::mask_add(src, k, x, y);
1427}
1428
1429template <typename T>
1430static SIMD_INLINE Vec<T, 64> maskz_hadd(const Mask<T, 64> &k,
1431 const Vec<T, 64> &a,
1432 const Vec<T, 64> &b)
1433{
1434 Vec<T, 64> x, y;
1435 unzip<1>(a, b, x, y);
1436 return internal::mask::maskz_add(k, x, y);
1437}
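// How the unzip-based implementation works (roughly): unzip<1> deinterleaves
// the element sequence of (a, b) into x (elements at even positions) and y
// (elements at odd positions), so x + y holds the sums of adjacent pairs,
// first those of a, then those of b; the masked add at the end lets src (or
// zero) pass through for lanes whose mask bit is clear.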
1438
1439// ---------------------------------------------------------------------------
1440// masked hadds v
1441// ---------------------------------------------------------------------------
1442
1443template <typename T>
1444static SIMD_INLINE Vec<T, 64> mask_hadds(const Vec<T, 64> &src,
1445 const Mask<T, 64> &k,
1446 const Vec<T, 64> &a,
1447 const Vec<T, 64> &b)
1448{
1449 Vec<T, 64> x, y;
1450 unzip<1>(a, b, x, y);
1451 return internal::mask::mask_adds(src, k, x, y);
1452}
1453
1454template <typename T>
1455static SIMD_INLINE Vec<T, 64> maskz_hadds(const Mask<T, 64> &k,
1456 const Vec<T, 64> &a,
1457 const Vec<T, 64> &b)
1458{
1459 Vec<T, 64> x, y;
1460 unzip<1>(a, b, x, y);
1461 return internal::mask::maskz_adds(k, x, y);
1462}
1463
1464// ---------------------------------------------------------------------------
1465// masked hsub v
1466// ---------------------------------------------------------------------------
1467
1468template <typename T>
1469static SIMD_INLINE Vec<T, 64> mask_hsub(const Vec<T, 64> &src,
1470 const Mask<T, 64> &k,
1471 const Vec<T, 64> &a,
1472 const Vec<T, 64> &b)
1473{
1474 Vec<T, 64> x, y;
1475 unzip<1>(a, b, x, y);
1476 return internal::mask::mask_sub(src, k, x, y);
1477}
1478
1479template <typename T>
1480static SIMD_INLINE Vec<T, 64> maskz_hsub(const Mask<T, 64> &k,
1481 const Vec<T, 64> &a,
1482 const Vec<T, 64> &b)
1483{
1484 Vec<T, 64> x, y;
1485 unzip<1>(a, b, x, y);
1486 return internal::mask::maskz_sub(k, x, y);
1487}
1488
1489// ---------------------------------------------------------------------------
1490// masked hsubs v
1491// ---------------------------------------------------------------------------
1492
1493template <typename T>
1494static SIMD_INLINE Vec<T, 64> mask_hsubs(const Vec<T, 64> &src,
1495 const Mask<T, 64> &k,
1496 const Vec<T, 64> &a,
1497 const Vec<T, 64> &b)
1498{
1499 Vec<T, 64> x, y;
1500 unzip<1>(a, b, x, y);
1501 return internal::mask::mask_subs(src, k, x, y);
1502}
1503
1504template <typename T>
1505static SIMD_INLINE Vec<T, 64> maskz_hsubs(const Mask<T, 64> &k,
1506 const Vec<T, 64> &a,
1507 const Vec<T, 64> &b)
1508{
1509 Vec<T, 64> x, y;
1510 unzip<1>(a, b, x, y);
1511 return internal::mask::maskz_subs(k, x, y);
1512}
1513
1514// 16. Oct 22 (Jonas Keller): added overloaded versions of mask_cmp* functions
1515// that only take two vector parameters and no mask parameter
1516
1517#define GENERATE_CMP(OP, TYPE, SUF) \
1518 static SIMD_INLINE Mask<TYPE, 64> mask_##OP( \
1519 const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
1520 { \
1521 return _mm512_mask_##OP##_##SUF##_mask(k, a, b); \
1522 } \
1523 static SIMD_INLINE Mask<TYPE, 64> mask_##OP(const Vec<TYPE, 64> &a, \
1524 const Vec<TYPE, 64> &b) \
1525 { \
1526 return _mm512_##OP##_##SUF##_mask(a, b); \
1527 }
1528
1529#define GENERATE_CMP_WITH_GENERALIZED_FCT(OP, TYPE, SUF, IMM8) \
1530 static SIMD_INLINE Mask<TYPE, 64> mask_##OP( \
1531 const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
1532 { \
1533 return _mm512_mask_cmp_##SUF##_mask(k, a, b, IMM8); \
1534 } \
1535 static SIMD_INLINE Mask<TYPE, 64> mask_##OP(const Vec<TYPE, 64> &a, \
1536 const Vec<TYPE, 64> &b) \
1537 { \
1538 return _mm512_cmp_##SUF##_mask(a, b, IMM8); \
1539 }
1540
1541// ---------------------------------------------------------------------------
1542// masked compare < v
1543// ---------------------------------------------------------------------------
1544
1545#ifdef __AVX512BW__
1546GENERATE_CMP(cmplt, Byte, epu8)
1547GENERATE_CMP(cmplt, SignedByte, epi8)
1548GENERATE_CMP(cmplt, Word, epu16)
1549GENERATE_CMP(cmplt, Short, epi16)
1550#endif
1551GENERATE_CMP(cmplt, Int, epi32)
1552GENERATE_CMP(cmplt, Long, epi64)
1553
1554GENERATE_CMP_WITH_GENERALIZED_FCT(cmplt, Float, ps, _CMP_LT_OS)
1555GENERATE_CMP_WITH_GENERALIZED_FCT(cmplt, Double, pd, _CMP_LT_OS)
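// Illustrative sketch of combining a masked compare with the blend helpers
// above (not part of the library): clamp only the negative Float lanes to zero:
//
//   // Vec<Float, 64> v = ...;
//   Mask<Float, 64> neg = mask_cmplt(v, ::simd::setzero<Float, 64>());
//   Vec<Float, 64> clamped = mask_ifelse(neg, ::simd::setzero<Float, 64>(), v);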
1556
1557// ---------------------------------------------------------------------------
1558// masked compare <= v
1559// ---------------------------------------------------------------------------
1560
1561#ifdef __AVX512BW__
1562GENERATE_CMP(cmple, Byte, epu8)
1563GENERATE_CMP(cmple, SignedByte, epi8)
1564GENERATE_CMP(cmple, Word, epu16)
1565GENERATE_CMP(cmple, Short, epi16)
1566#endif
1567GENERATE_CMP(cmple, Int, epi32)
1568GENERATE_CMP(cmple, Long, epi64)
1569
1570GENERATE_CMP_WITH_GENERALIZED_FCT(cmple, Float, ps, _CMP_LE_OS)
1571GENERATE_CMP_WITH_GENERALIZED_FCT(cmple, Double, pd, _CMP_LE_OS)
1572
1573// ---------------------------------------------------------------------------
1574// masked compare == v
1575// ---------------------------------------------------------------------------
1576
1577#ifdef __AVX512BW__
1578GENERATE_CMP(cmpeq, Byte, epu8)
1579GENERATE_CMP(cmpeq, SignedByte, epi8)
1580GENERATE_CMP(cmpeq, Word, epu16)
1581GENERATE_CMP(cmpeq, Short, epi16)
1582#endif
1583GENERATE_CMP(cmpeq, Int, epi32)
1584GENERATE_CMP(cmpeq, Long, epi64)
1585
1586GENERATE_CMP_WITH_GENERALIZED_FCT(cmpeq, Float, ps, _CMP_EQ_OQ)
1587GENERATE_CMP_WITH_GENERALIZED_FCT(cmpeq, Double, pd, _CMP_EQ_OQ)
1588
1589// ---------------------------------------------------------------------------
1590// masked compare > v
1591// ---------------------------------------------------------------------------
1592
1593#ifdef __AVX512BW__
1594GENERATE_CMP(cmpgt, Byte, epu8)
1595GENERATE_CMP(cmpgt, SignedByte, epi8)
1596GENERATE_CMP(cmpgt, Word, epu16)
1597GENERATE_CMP(cmpgt, Short, epi16)
1598#endif
1599GENERATE_CMP(cmpgt, Int, epi32)
1600GENERATE_CMP(cmpgt, Long, epi64)
1601
1602GENERATE_CMP_WITH_GENERALIZED_FCT(cmpgt, Float, ps, _CMP_GT_OS)
1603GENERATE_CMP_WITH_GENERALIZED_FCT(cmpgt, Double, pd, _CMP_GT_OS)
1604
1605// ---------------------------------------------------------------------------
1606// masked compare >= v
1607// ---------------------------------------------------------------------------
1608
1609#ifdef __AVX512BW__
1610GENERATE_CMP(cmpge, Byte, epu8)
1611GENERATE_CMP(cmpge, SignedByte, epi8)
1612GENERATE_CMP(cmpge, Word, epu16)
1613GENERATE_CMP(cmpge, Short, epi16)
1614#endif
1615GENERATE_CMP(cmpge, Int, epi32)
1616GENERATE_CMP(cmpge, Long, epi64)
1617
1618GENERATE_CMP_WITH_GENERALIZED_FCT(cmpge, Float, ps, _CMP_GE_OS)
1619GENERATE_CMP_WITH_GENERALIZED_FCT(cmpge, Double, pd, _CMP_GE_OS)
1620
1621// ---------------------------------------------------------------------------
1622// masked compare != v
1623// ---------------------------------------------------------------------------
1624
1625#ifdef __AVX512BW__
1626GENERATE_CMP(cmpneq, Byte, epu8)
1627GENERATE_CMP(cmpneq, SignedByte, epi8)
1628GENERATE_CMP(cmpneq, Word, epu16)
1629GENERATE_CMP(cmpneq, Short, epi16)
1630#endif
1631GENERATE_CMP(cmpneq, Int, epi32)
1632GENERATE_CMP(cmpneq, Long, epi64)
1633
1634GENERATE_CMP_WITH_GENERALIZED_FCT(cmpneq, Float, ps, _CMP_NEQ_OQ)
1635GENERATE_CMP_WITH_GENERALIZED_FCT(cmpneq, Double, pd, _CMP_NEQ_OQ)
1636
1637// ---------------------------------------------------------------------------
1638// masked avg: average with rounding up (integer versions) v
1639// ---------------------------------------------------------------------------
1640
1641#ifdef __AVX512BW__
1642static SIMD_INLINE Vec<Byte, 64> mask_avg(const Vec<Byte, 64> &src,
1643 const Mask<Byte, 64> &k,
1644 const Vec<Byte, 64> &a,
1645 const Vec<Byte, 64> &b)
1646{
1647 return _mm512_mask_avg_epu8(src, k, a, b);
1648}
1649
1650static SIMD_INLINE Vec<Byte, 64> maskz_avg(const Mask<Byte, 64> &k,
1651 const Vec<Byte, 64> &a,
1652 const Vec<Byte, 64> &b)
1653{
1654 return _mm512_maskz_avg_epu8(k, a, b);
1655}
1656
1657static SIMD_INLINE Vec<Word, 64> mask_avg(const Vec<Word, 64> &src,
1658 const Mask<Word, 64> &k,
1659 const Vec<Word, 64> &a,
1660 const Vec<Word, 64> &b)
1661{
1662 return _mm512_mask_avg_epu16(src, k, a, b);
1663}
1664
1665static SIMD_INLINE Vec<Word, 64> maskz_avg(const Mask<Word, 64> &k,
1666 const Vec<Word, 64> &a,
1667 const Vec<Word, 64> &b)
1668{
1669 return _mm512_maskz_avg_epu16(k, a, b);
1670}
1671#endif
1672
1673// Paul R at
1674// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
1675
1676template <typename T,
1677 SIMD_ENABLE_IF(std::is_integral<T>::value &&std::is_signed<T>::value)>
1678static SIMD_INLINE Vec<T, 64> mask_avg(const Vec<T, 64> &src,
1679 const Mask<T, 64> &k,
1680 const Vec<T, 64> &a, const Vec<T, 64> &b)
1681{
1682 const auto one = ::simd::set1<T, 64>(1);
1683 const auto lsb = bit_and(bit_or(a, b), one);
1684 const auto as = srai<1>(a);
1685 const auto bs = srai<1>(b);
1686 return internal::mask::mask_add(src, k, lsb, add(as, bs));
1687}
1688
1689template <typename T,
1690 SIMD_ENABLE_IF(std::is_integral<T>::value &&std::is_signed<T>::value)>
1691static SIMD_INLINE Vec<T, 64> maskz_avg(const Mask<T, 64> &k,
1692 const Vec<T, 64> &a,
1693 const Vec<T, 64> &b)
1694{
1695 const auto one = ::simd::set1<T, 64>(1);
1696 const auto lsb = bit_and(bit_or(a, b), one);
1697 const auto as = srai<1>(a);
1698 const auto bs = srai<1>(b);
1699 return internal::mask::maskz_add(k, lsb, add(as, bs));
1700}
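// Why this equals (a + b + 1) >> 1 without risking overflow (illustrative
// check with Short lanes): for a = 32766, b = 32767 the direct sum 65533 would
// overflow, whereas
//   (a >> 1) + (b >> 1) + ((a | b) & 1) = 16383 + 16383 + 1 = 32767,
// the exact average 32766.5 rounded up.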
1701
1702// NOTE: Float version doesn't round!
1703static SIMD_INLINE Vec<Float, 64> mask_avg(const Vec<Float, 64> &src,
1704 const Mask<Float, 64> &k,
1705 const Vec<Float, 64> &a,
1706 const Vec<Float, 64> &b)
1707{
1708 return _mm512_mask_mul_ps(src, k, _mm512_maskz_add_ps(k, a, b),
1709 _mm512_set1_ps(0.5f));
1710}
1711
1712// NOTE: Float version doesn't round!
1713static SIMD_INLINE Vec<Float, 64> maskz_avg(const Mask<Float, 64> &k,
1714 const Vec<Float, 64> &a,
1715 const Vec<Float, 64> &b)
1716{
1717 return _mm512_maskz_mul_ps(k, _mm512_maskz_add_ps(k, a, b),
1718 _mm512_set1_ps(0.5f));
1719}
1720
1721// NOTE: Double version doesn't round!
1722static SIMD_INLINE Vec<Double, 64> mask_avg(const Vec<Double, 64> &src,
1723 const Mask<Double, 64> &k,
1724 const Vec<Double, 64> &a,
1725 const Vec<Double, 64> &b)
1726{
1727 return _mm512_mask_mul_pd(src, k, _mm512_maskz_add_pd(k, a, b),
1728 _mm512_set1_pd(0.5));
1729}
1730
1731// NOTE: Double version doesn't round!
1732static SIMD_INLINE Vec<Double, 64> maskz_avg(const Mask<Double, 64> &k,
1733 const Vec<Double, 64> &a,
1734 const Vec<Double, 64> &b)
1735{
1736 return _mm512_maskz_mul_pd(k, _mm512_maskz_add_pd(k, a, b),
1737 _mm512_set1_pd(0.5));
1738}
1739
1740// ---------------------------------------------------------------------------
1741// masked test_all_zeros v
1742// ---------------------------------------------------------------------------
1743
1744#define TEST_ALL_ZEROS(TYPE, SUF) \
1745 static SIMD_INLINE bool mask_test_all_zeros(const Mask<TYPE, 64> &k, \
1746 const Vec<TYPE, 64> &a) \
1747 { \
1748 return (_mm512_mask_test_epi##SUF##_mask(k, a, a) == 0); \
1749 }
1750
1751#ifdef __AVX512BW__
1752TEST_ALL_ZEROS(Byte, 8)
1753TEST_ALL_ZEROS(SignedByte, 8)
1754TEST_ALL_ZEROS(Word, 16)
1755TEST_ALL_ZEROS(Short, 16)
1756#endif
1757TEST_ALL_ZEROS(Int, 32)
1758TEST_ALL_ZEROS(Long, 64)
1759
1760static SIMD_INLINE bool mask_test_all_zeros(const Mask<Float, 64> &k,
1761 const Vec<Float, 64> &a)
1762{
1763 return (_mm512_mask_test_epi32_mask(k, _mm512_castps_si512(a),
1764 _mm512_castps_si512(a)) == 0);
1765}
1766
1767static SIMD_INLINE bool mask_test_all_zeros(const Mask<Double, 64> &k,
1768 const Vec<Double, 64> &a)
1769{
1770 return (_mm512_mask_test_epi64_mask(k, _mm512_castpd_si512(a),
1771 _mm512_castpd_si512(a)) == 0);
1772}
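
// Hypothetical usage sketch (not part of the library interface): the masked
// test is true exactly when every element of a selected by k is zero;
// elements outside the mask are ignored.
static SIMD_INLINE bool maskTestAllZerosExample()
{
  const Vec<Int, 64> a  = ::simd::setzero<Int, 64>(); // all 16 lanes zero
  const Mask<Int, 64> k = __mmask16(0xFFFF);          // select all 16 lanes
  return mask_test_all_zeros(k, a);                   // true here
}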
1773
1774// ---------------------------------------------------------------------------
1775// masked test_all_ones v
1776// ---------------------------------------------------------------------------
1777
1778// already defined in mask_impl_emu.H
1779
1780// ---------------------------------------------------------------------------
1781// mask_all_ones v
1782// ---------------------------------------------------------------------------
1783
1784#define MASK_ALL_ONES(TYPE, MASK) \
1785 static SIMD_INLINE Mask<TYPE, 64> mask_all_ones(OutputType<TYPE>, \
1786 Integer<64>) \
1787 { \
1788 return MASK; \
1789 }
1790
1791#ifdef __AVX512BW__
1792MASK_ALL_ONES(Byte, 0xFFFFFFFFFFFFFFFF)
1793MASK_ALL_ONES(SignedByte, 0xFFFFFFFFFFFFFFFF)
1794MASK_ALL_ONES(Word, 0xFFFFFFFF)
1795MASK_ALL_ONES(Short, 0xFFFFFFFF)
1796#endif
1797MASK_ALL_ONES(Int, 0xFFFF)
1798MASK_ALL_ONES(Float, 0xFFFF)
1799MASK_ALL_ONES(Long, 0xFF)
1800MASK_ALL_ONES(Double, 0xFF)
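
// Note: each constant has one mask bit per lane of the 512-bit vector:
// 64 bits for Byte/SignedByte, 32 for Word/Short, 16 for Int/Float and
// 8 for Long/Double.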
1801
1802/*
1803Short explanation:
1804Intrinsics (e.g. _kand_mask16, _kor_mask32) are only available for gcc
1805versions >= 7, and the intrinsics for __mmask32 and __mmask64 are only
1806available under AVX512BW. Intrinsics with a different name and only for
1807__mmask16 (e.g. _mm512_kand) are available for gcc versions >= 6.
1808If AVX512BW is not available, the Byte/SignedByte/Word/Short masks are
1809vectors, so the vector functions are used instead. The last resort
1810(because it is probably slower in most cases) is to emulate the functions
1811with normal operators (e.g. "+" for kadd, "<<" for kshiftl, "&" for kand).
1812*/
1813
1814#if __GNUC__ >= 7 // TODO: other compilers (not really a problem; the
1815 // intrinsics will just not be used then)
1816#define GENERATE_DMASKOP(NAME, TYPE, NUM) \
1817 static SIMD_INLINE Mask<TYPE, 64> k##NAME(const Mask<TYPE, 64> &a, \
1818 const Mask<TYPE, 64> &b) \
1819 { \
1820 return _k##NAME##_mask##NUM(a, b); \
1821 }
1822
1823#define KNOT(TYPE, NUM) \
1824 static SIMD_INLINE Mask<TYPE, 64> knot(const Mask<TYPE, 64> &a) \
1825 { \
1826 return _knot_mask##NUM(a); \
1827 }
1828
1829// shift with template parameter
1830#define KSHIFT(R_OR_L, TYPE, NUM) \
1831 template <size_t COUNT> \
1832 static SIMD_INLINE Mask<TYPE, 64> kshift##R_OR_L##i(const Mask<TYPE, 64> &a) \
1833 { \
1834 return _kshift##R_OR_L##i_mask##NUM(a, COUNT); \
1835 }
1836#ifdef __AVX512BW__
1837GENERATE_DMASKOP(and, Byte, 64)
1838GENERATE_DMASKOP(and, SignedByte, 64)
1839GENERATE_DMASKOP(and, Word, 32)
1840GENERATE_DMASKOP(and, Short, 32)
1841
1842GENERATE_DMASKOP(andn, Byte, 64)
1843GENERATE_DMASKOP(andn, SignedByte, 64)
1844GENERATE_DMASKOP(andn, Word, 32)
1845GENERATE_DMASKOP(andn, Short, 32)
1846
1847GENERATE_DMASKOP(or, Byte, 64)
1848GENERATE_DMASKOP(or, SignedByte, 64)
1849GENERATE_DMASKOP(or, Word, 32)
1850GENERATE_DMASKOP(or, Short, 32)
1851
1852GENERATE_DMASKOP(xor, Byte, 64)
1853GENERATE_DMASKOP(xor, SignedByte, 64)
1854GENERATE_DMASKOP(xor, Word, 32)
1855GENERATE_DMASKOP(xor, Short, 32)
1856
1857GENERATE_DMASKOP(xnor, Byte, 64)
1858GENERATE_DMASKOP(xnor, SignedByte, 64)
1859GENERATE_DMASKOP(xnor, Word, 32)
1860GENERATE_DMASKOP(xnor, Short, 32)
1861
1862GENERATE_DMASKOP(add, Byte, 64)
1863GENERATE_DMASKOP(add, SignedByte, 64)
1864GENERATE_DMASKOP(add, Word, 32)
1865GENERATE_DMASKOP(add, Short, 32)
1866
1867KNOT(Byte, 64)
1868KNOT(SignedByte, 64)
1869KNOT(Word, 32)
1870KNOT(Short, 32)
1871
1872KSHIFT(r, Byte, 64)
1873KSHIFT(r, SignedByte, 64)
1874KSHIFT(r, Word, 32)
1875KSHIFT(r, Short, 32)
1876KSHIFT(l, Byte, 64)
1877KSHIFT(l, SignedByte, 64)
1878KSHIFT(l, Word, 32)
1879KSHIFT(l, Short, 32)
1880// else-case is further down
1881#endif // ifdef __AVX512BW__
1882
1883GENERATE_DMASKOP(and, Int, 16)
1884GENERATE_DMASKOP(and, Float, 16)
1885GENERATE_DMASKOP(and, Long, 8)
1886GENERATE_DMASKOP(and, Double, 8)
1887
1888GENERATE_DMASKOP(andn, Int, 16)
1889GENERATE_DMASKOP(andn, Float, 16)
1890GENERATE_DMASKOP(andn, Long, 8)
1891GENERATE_DMASKOP(andn, Double, 8)
1892
1893GENERATE_DMASKOP(or, Int, 16)
1894GENERATE_DMASKOP(or, Float, 16)
1895GENERATE_DMASKOP(or, Long, 8)
1896GENERATE_DMASKOP(or, Double, 8)
1897
1898GENERATE_DMASKOP(xor, Int, 16)
1899GENERATE_DMASKOP(xor, Float, 16)
1900GENERATE_DMASKOP(xor, Long, 8)
1901GENERATE_DMASKOP(xor, Double, 8)
1902
1903GENERATE_DMASKOP(xnor, Int, 16)
1904GENERATE_DMASKOP(xnor, Float, 16)
1905GENERATE_DMASKOP(xnor, Long, 8)
1906GENERATE_DMASKOP(xnor, Double, 8)
1907
1908#ifdef __AVX512DQ__ // _kadd_mask16 and _kadd_mask8 are only available under
1909 // AVX512DQ
1910GENERATE_DMASKOP(add, Int, 16)
1911GENERATE_DMASKOP(add, Float, 16)
1912GENERATE_DMASKOP(add, Long, 8)
1913GENERATE_DMASKOP(add, Double, 8)
1914#endif
1915
1916KNOT(Int, 16)
1917KNOT(Float, 16)
1918KNOT(Long, 8)
1919KNOT(Double, 8)
1920
1921KSHIFT(r, Int, 16)
1922KSHIFT(r, Float, 16)
1923KSHIFT(r, Long, 8)
1924KSHIFT(r, Double, 8)
1925
1926KSHIFT(l, Int, 16)
1927KSHIFT(l, Float, 16)
1928KSHIFT(l, Long, 8)
1929KSHIFT(l, Double, 8)
1930#else
1931// (__GNUC__ >= 7) is false
1932#if __GNUC__ >= 6
1933// At least the intrinsics for 16- and 8-masks (Int, Float, Long and Double) are
1934// defined.
1935#define GENERATE_DMASKOP(NAME, TYPE, NUM) \
1936 static SIMD_INLINE Mask<TYPE, 64> k##NAME(const Mask<TYPE, 64> &a, \
1937 const Mask<TYPE, 64> &b) \
1938 { \
1939 return _mm512_k##NAME(a, b); \
1940 }
1941
1942#define KNOT(TYPE, NUM) \
1943 static SIMD_INLINE Mask<TYPE, 64> knot(const Mask<TYPE, 64> &a) \
1944 { \
1945 return _mm512_knot(a); \
1946 }
1947GENERATE_DMASKOP(and, Int, 16)
1948GENERATE_DMASKOP(and, Float, 16)
1949GENERATE_DMASKOP(and, Long, 8)
1950GENERATE_DMASKOP(and, Double, 8)
1951
1952GENERATE_DMASKOP(andn, Int, 16)
1953GENERATE_DMASKOP(andn, Float, 16)
1954GENERATE_DMASKOP(andn, Long, 8)
1955GENERATE_DMASKOP(andn, Double, 8)
1956
1957GENERATE_DMASKOP(or, Int, 16)
1958GENERATE_DMASKOP(or, Float, 16)
1959GENERATE_DMASKOP(or, Long, 8)
1960GENERATE_DMASKOP(or, Double, 8)
1961
1962GENERATE_DMASKOP(xor, Int, 16)
1963GENERATE_DMASKOP(xor, Float, 16)
1964GENERATE_DMASKOP(xor, Long, 8)
1965GENERATE_DMASKOP(xor, Double, 8)
1966
1967GENERATE_DMASKOP(xnor, Int, 16)
1968GENERATE_DMASKOP(xnor, Float, 16)
1969GENERATE_DMASKOP(xnor, Long, 8)
1970GENERATE_DMASKOP(xnor, Double, 8)
1971
1972KNOT(Int, 16)
1973KNOT(Float, 16)
1974KNOT(Long, 8)
1975KNOT(Double, 8)
1976#endif
1977
1978template <typename T>
1979static SIMD_INLINE Mask<T, 64> kand(const Mask<T, 64> &a, const Mask<T, 64> &b)
1980{
1981 return (a & b);
1982}
1983
1984template <typename T>
1985static SIMD_INLINE Mask<T, 64> kandn(const Mask<T, 64> &a, const Mask<T, 64> &b)
1986{
1987 return (~a) & b;
1988}
1989
1990template <typename T>
1991static SIMD_INLINE Mask<T, 64> kor(const Mask<T, 64> &a, const Mask<T, 64> &b)
1992{
1993 return (a | b);
1994}
1995
1996template <typename T>
1997static SIMD_INLINE Mask<T, 64> kxor(const Mask<T, 64> &a, const Mask<T, 64> &b)
1998{
1999 return (a ^ b);
2000}
2001
2002template <typename T>
2003static SIMD_INLINE Mask<T, 64> kxnor(const Mask<T, 64> &a, const Mask<T, 64> &b)
2004{
2005 return ~(a ^ b);
2006}
2007
2008template <typename T>
2009static SIMD_INLINE Mask<T, 64> kadd(const Mask<T, 64> &a, const Mask<T, 64> &b)
2010{
2011 return (a + b);
2012}
2013
2014template <typename T>
2015static SIMD_INLINE Mask<T, 64> knot(const Mask<T, 64> &a)
2016{
2017 return ~a;
2018}
2019
2020template <size_t COUNT, typename T>
2021static SIMD_INLINE Mask<T, 64> kshiftri(const Mask<T, 64> &a)
2022{
2023 // 04. Aug 22 (Jonas Keller):
2024 // return zero if COUNT is larger than 63, since then the >> operator is
2025 // undefined, but kshift should return zero
2026 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
2027 // since COUNT is a constant, the compiler should optimize away the
2028 // if-statement
2029 if (COUNT >= 64) { return 0; }
2030// we checked that COUNT is not too large above, disable warning
2031#pragma GCC diagnostic push
2032#pragma GCC diagnostic ignored "-Wshift-count-overflow"
2033 return ((uint64_t) a) >> ((uint64_t) COUNT);
2034#pragma GCC diagnostic pop
2035}
2036
2037template <size_t COUNT, typename T>
2038static SIMD_INLINE Mask<T, 64> kshiftli(const Mask<T, 64> &a)
2039{
2040 // 04. Aug 22 (Jonas Keller):
2041 // return zero if COUNT is larger than 63, since then the << operator is
2042 // undefined, but kshift should return zero
2043 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
2044 // since COUNT is a constant, the compiler should optimize away the
2045 // if-statement
2046 if (COUNT >= 64) { return 0; }
2047// we checked that COUNT is not too large above, disable warning
2048#pragma GCC diagnostic push
2049#pragma GCC diagnostic ignored "-Wshift-count-overflow"
2050 return ((uint64_t) a) << ((uint64_t) COUNT);
2051#pragma GCC diagnostic pop
2052}
2053#endif // if __GNUC__ >= 7
2054
2055// shift with a run-time count (not a template parameter), probably slower
2056// than the template version
2057/*//TODO faster implementation with switch-case possible?
2058#define SHIFT_CASE(OP, NUM) case : OP<NUM>(a); break;
2059
2060#define EMULATE_KSHIFT(R_OR_L, OP, TYPE) \
2061static SIMD_INLINE Mask<TYPE, 64> \
2062kshift ## R_OR_L ## i (const Mask<TYPE, 64> &a, \
2063 uint64_t count) \
2064{ \
2065 return (a OP count); \
2066 switch(count) { \
2067 SHIFT_CASE(OP2, 0) \
2068 SHIFT_CASE(OP2, 1) \
2069 } \
2070}*/
2071
2072template <typename T>
2073static SIMD_INLINE Mask<T, 64> kshiftli(const Mask<T, 64> &a, uint64_t count)
2074{
2075 // 04. Aug 22 (Jonas Keller):
2076 // return zero if count is larger than 63, since then the << operator is
2077 // undefined, but kshift should return zero
2078 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
2079 if (count >= 64) { return Mask<T, 64>(0); }
2080 return Mask<T, 64>(((uint64_t) a) << count);
2081}
2082
2083template <typename T>
2084static SIMD_INLINE Mask<T, 64> kshiftri(const Mask<T, 64> &a, uint64_t count)
2085{
2086 // 04. Aug 22 (Jonas Keller):
2087 // return zero if count is larger than 63, since then the >> operator is
2088 // undefined, but kshift should return zero
2089 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kshift
2090 if (count >= 64) { return Mask<T, 64>(0); }
2091 return Mask<T, 64>(((uint64_t) a) >> count);
2092}
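
// Illustration (hypothetical values): shifting a 16-lane mask moves the
// selection across lanes, e.g. kshiftri(Mask<Int, 64>(__mmask16(0xF0F0)), 4)
// yields 0x0F0F, while any count >= 64 yields an all-zero mask instead of
// relying on the undefined behavior of an oversized C++ shift.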
2093
2094// 07. Aug 23 (Jonas Keller): added mask_test_all_zeros/ones.
2095
2096template <typename T>
2097static SIMD_INLINE bool mask_test_all_zeros(const Mask<T, 64> &a)
2098{
2099 return a == 0;
2100}
2101
2102template <typename T>
2103static SIMD_INLINE bool mask_test_all_ones(const Mask<T, 64> &a)
2104{
2105 return a == mask_all_ones(OutputType<T>(), Integer<64>());
2106}
2107
2108// 07. Aug 23 (Jonas Keller): added kcmpeq
2109
2110template <typename T>
2111static SIMD_INLINE Mask<T, 64> kcmpeq(const Mask<T, 64> &a,
2112 const Mask<T, 64> &b)
2113{
2114 return a == b;
2115}
2116} // namespace mask
2117} // namespace internal
2118} // namespace simd
2119
2120#endif
2121
2122#endif // SIMD_VEC_MASK_IMPL_INTEL_64_H_