#ifndef SIMD_VEC_MASK_IMPL_INTEL_64_H_
#define SIMD_VEC_MASK_IMPL_INTEL_64_H_

#include "../mask_impl_emu.H"
#include "base_impl_intel64.H"
#include "intrins_intel.H"

#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_64_AVAIL_) && \
  !defined(SIMDVEC_SANDBOX)
// Mask<TYPE, 64> wraps an AVX-512 opmask register, one bit per vector lane.
#define CLASS_MASK(TYPE, MASK_SIZE) \
  template <> class Mask<TYPE, 64> \
  { \
  public: \
    __mmask##MASK_SIZE k; \
    SIMD_INLINE Mask(const __mmask##MASK_SIZE &x) : k(x) {} \
    /* construction from a Vec; body not shown in this excerpt */ \
    explicit SIMD_INLINE Mask(const Vec<TYPE, 64> &x); \
    SIMD_INLINE Mask &operator=(const __mmask##MASK_SIZE &x) { k = x; return *this; } \
    SIMD_INLINE operator __mmask##MASK_SIZE() const { return k; } \
    explicit SIMD_INLINE operator Vec<TYPE, 64>() const { return int2bits<TYPE, 64>(k); } \
    SIMD_INLINE bool operator[](const uint8_t i) const { return ((1lu << i) & k) != 0; } \
    SIMD_INLINE bool operator==(const Mask<TYPE, 64> &x) const { return k == x.k; } \
  };
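
// The per-type CLASS_MASK instantiations are not part of this excerpt. For
// 64-byte vectors the opmask width equals the lane count, so they presumably
// read as follows (assumption, consistent with the MASK_ALL_ONES constants
// further below):
//
//   CLASS_MASK(Byte, 64)   CLASS_MASK(SignedByte, 64)
//   CLASS_MASK(Word, 32)   CLASS_MASK(Short, 32)
//   CLASS_MASK(Int, 16)    CLASS_MASK(Float, 16)
//   CLASS_MASK(Long, 8)    CLASS_MASK(Double, 8)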
#define MASK_SOP(OP, TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_##OP( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a) \
  { \
    return _mm512_mask_##OP##_##SUF(src, k, a); \
  }

#define MASKZ_SOP(OP, TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> maskz_##OP(const Mask<TYPE, 64> &k, \
                                              const Vec<TYPE, 64> &a) \
  { \
    return _mm512_maskz_##OP##_##SUF(k, a); \
  }

#define GENERATE_SOP(OP, TYPE, SUF) \
  MASK_SOP(OP, TYPE, SUF) MASKZ_SOP(OP, TYPE, SUF)

#define MASK_DOP(OP, TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_##OP( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, \
    const Vec<TYPE, 64> &b) \
  { \
    return _mm512_mask_##OP##_##SUF(src, k, a, b); \
  }

#define MASKZ_DOP(OP, TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> maskz_##OP( \
    const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
  { \
    return _mm512_maskz_##OP##_##SUF(k, a, b); \
  }

#define GENERATE_DOP(OP, TYPE, SUF) \
  MASK_DOP(OP, TYPE, SUF) MASKZ_DOP(OP, TYPE, SUF)
#define MASK_IFELSE(TYPE, SUF, REG) \
  static SIMD_INLINE Vec<TYPE, 64> mask_ifelse(const Mask<TYPE, 64> &cond, \
                                               const Vec<TYPE, 64> &a, \
                                               const Vec<TYPE, 64> &b) \
  { \
    return (REG) _mm512_mask_blend_##SUF(cond, (REG) b, (REG) a); \
  }

MASK_IFELSE(Byte, epi8, __m512i)
MASK_IFELSE(SignedByte, epi8, __m512i)
MASK_IFELSE(Word, epi16, __m512i)
MASK_IFELSE(Short, epi16, __m512i)
MASK_IFELSE(Int, epi32, __m512i)
MASK_IFELSE(Float, ps, __m512)
MASK_IFELSE(Long, epi64, __m512i)
MASK_IFELSE(Double, pd, __m512d)
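
// Usage sketch (illustrative only, not part of the original source): per-lane
// selection between two Float vectors with the wrappers defined in this file.
//
//   Vec<Float, 64> a = ..., b = ...;
//   Mask<Float, 64> m = mask_cmplt(a, b);      // lanes where a < b
//   Vec<Float, 64> lo = mask_ifelse(m, a, b);  // per-lane minimum
//   Vec<Float, 64> z  = mask_ifelsezero(m, a); // a where a < b, 0 elsewhere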
#define MASK_IFELSEZERO(TYPE) \
  static SIMD_INLINE Vec<TYPE, 64> mask_ifelsezero( \
    const Mask<TYPE, 64> &cond, const Vec<TYPE, 64> &trueVal) \
  { \
    return mask_ifelse(cond, trueVal, ::simd::setzero<TYPE, 64>()); \
  }

MASK_IFELSEZERO(Byte)
MASK_IFELSEZERO(SignedByte)
MASK_IFELSEZERO(Word)
MASK_IFELSEZERO(Short)
MASK_IFELSEZERO(Int)
MASK_IFELSEZERO(Float)
MASK_IFELSEZERO(Long)
MASK_IFELSEZERO(Double)
template <typename Tout, typename Tin>
static SIMD_INLINE Mask<Tout, 64> reinterpret_mask(const Mask<Tin, 64> &k)
{
  static_assert(sizeof(Tout) == sizeof(Tin), "");
  return Mask<Tout, 64>(k.k);
}
static SIMD_INLINE Vec<Int, 64> maskz_cvts(const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a)
{
  __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
  return _mm512_maskz_cvtps_epi32(k, _mm512_maskz_min_ps(k, clip, a));
}

static SIMD_INLINE Vec<Int, 64> mask_cvts(const Vec<Int, 64> &src,
                                          const Mask<Float, 64> &k,
                                          const Vec<Float, 64> &a)
{
  __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
  return _mm512_mask_cvtps_epi32(src, k, _mm512_maskz_min_ps(k, clip, a));
}
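
// Note: _mm512_cvtps_epi32 yields the "integer indefinite" value 0x80000000
// for lanes whose value does not fit into Int, so values are first clamped to
// MAX_POS_FLOAT_CONVERTIBLE_TO_INT32, the largest float that still converts
// to a valid Int. Example: a lane holding 3.0e9f is clamped before the
// conversion instead of wrapping to INT32_MIN.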
static SIMD_INLINE Vec<Float, 64> maskz_cvts(const Mask<Int, 64> &k,
                                             const Vec<Int, 64> &a)
{
  return _mm512_maskz_cvtepi32_ps(k, a);
}

static SIMD_INLINE Vec<Float, 64> mask_cvts(const Vec<Float, 64> &src,
                                            const Mask<Int, 64> &k,
                                            const Vec<Int, 64> &a)
{
  return _mm512_mask_cvtepi32_ps(src, k, a);
}
#define GENERATE_SET1(TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_set1( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE a) \
  { \
    return _mm512_mask_set1_##SUF(src, k, a); \
  } \
  static SIMD_INLINE Vec<TYPE, 64> maskz_set1(const Mask<TYPE, 64> &k, \
                                              const TYPE a) \
  { \
    return _mm512_maskz_set1_##SUF(k, a); \
  }

GENERATE_SET1(Byte, epi8)
GENERATE_SET1(SignedByte, epi8)
GENERATE_SET1(Word, epi16)
GENERATE_SET1(Short, epi16)
GENERATE_SET1(Int, epi32)
GENERATE_SET1(Long, epi64)

// Float and Double set1 go through the integer domain via bit_cast
static SIMD_INLINE Vec<Float, 64> mask_set1(const Vec<Float, 64> &src,
                                            const Mask<Float, 64> &k,
                                            const Float a)
{
  return _mm512_castsi512_ps(
    _mm512_mask_set1_epi32(_mm512_castps_si512(src), k, bit_cast<Int>(a)));
}

static SIMD_INLINE Vec<Float, 64> maskz_set1(const Mask<Float, 64> &k,
                                             const Float a)
{
  return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(k, bit_cast<Int>(a)));
}

static SIMD_INLINE Vec<Double, 64> mask_set1(const Vec<Double, 64> &src,
                                             const Mask<Double, 64> &k,
                                             const Double a)
{
  return _mm512_castsi512_pd(
    _mm512_mask_set1_epi64(_mm512_castpd_si512(src), k, bit_cast<Long>(a)));
}

static SIMD_INLINE Vec<Double, 64> maskz_set1(const Mask<Double, 64> &k,
                                              const Double a)
{
  return _mm512_castsi512_pd(_mm512_maskz_set1_epi64(k, bit_cast<Long>(a)));
}
#define GENERATE_LOAD(NAME, TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_load( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE *const p) \
  { \
    SIMD_CHECK_ALIGNMENT(p, 64); \
    return _mm512_mask_##NAME##_##SUF(src, k, p); \
  } \
  static SIMD_INLINE Vec<TYPE, 64> maskz_load(const Mask<TYPE, 64> &k, \
                                              const TYPE *const p) \
  { \
    SIMD_CHECK_ALIGNMENT(p, 64); \
    return _mm512_maskz_##NAME##_##SUF(k, p); \
  }

// no masked aligned-load intrinsics exist for 8-/16-bit elements, so those
// types use the unaligned intrinsics internally (alignment is still checked)
GENERATE_LOAD(loadu, Byte, epi8)
GENERATE_LOAD(loadu, SignedByte, epi8)
GENERATE_LOAD(loadu, Word, epi16)
GENERATE_LOAD(loadu, Short, epi16)
GENERATE_LOAD(load, Int, epi32)
GENERATE_LOAD(load, Float, ps)
GENERATE_LOAD(load, Long, epi64)
GENERATE_LOAD(load, Double, pd)
#define GENERATE_LOADU(TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_loadu( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const TYPE *const p) \
  { \
    return _mm512_mask_loadu_##SUF(src, k, p); \
  } \
  static SIMD_INLINE Vec<TYPE, 64> maskz_loadu(const Mask<TYPE, 64> &k, \
                                               const TYPE *const p) \
  { \
    return _mm512_maskz_loadu_##SUF(k, p); \
  }

GENERATE_LOADU(Byte, epi8)
GENERATE_LOADU(SignedByte, epi8)
GENERATE_LOADU(Word, epi16)
GENERATE_LOADU(Short, epi16)
GENERATE_LOADU(Int, epi32)
GENERATE_LOADU(Float, ps)
GENERATE_LOADU(Long, epi64)
GENERATE_LOADU(Double, pd)
#define MASK_STORE(NAME, TYPE, SUF) \
  static SIMD_INLINE void mask_store(TYPE *const p, const Mask<TYPE, 64> &k, \
                                     const Vec<TYPE, 64> &a) \
  { \
    SIMD_CHECK_ALIGNMENT(p, 64); \
    return _mm512_mask_##NAME##_##SUF(p, k, a); \
  }

MASK_STORE(storeu, Byte, epi8)
MASK_STORE(storeu, SignedByte, epi8)
MASK_STORE(storeu, Word, epi16)
MASK_STORE(storeu, Short, epi16)
MASK_STORE(store, Int, epi32)
MASK_STORE(store, Float, ps)
MASK_STORE(store, Long, epi64)
MASK_STORE(store, Double, pd)
#define MASK_STOREU(TYPE, SUF) \
  static SIMD_INLINE void mask_storeu(TYPE *const p, const Mask<TYPE, 64> &k, \
                                      const Vec<TYPE, 64> &a) \
  { \
    return _mm512_mask_storeu_##SUF(p, k, a); \
  }

MASK_STOREU(Byte, epi8)
MASK_STOREU(SignedByte, epi8)
MASK_STOREU(Word, epi16)
MASK_STOREU(Short, epi16)
MASK_STOREU(Int, epi32)
MASK_STOREU(Float, ps)
MASK_STOREU(Long, epi64)
MASK_STOREU(Double, pd)
GENERATE_DOP(add, Byte, epi8)
GENERATE_DOP(add, SignedByte, epi8)
GENERATE_DOP(add, Word, epi16)
GENERATE_DOP(add, Short, epi16)
GENERATE_DOP(add, Int, epi32)
GENERATE_DOP(add, Float, ps)
GENERATE_DOP(add, Long, epi64)
GENERATE_DOP(add, Double, pd)
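
// Per-lane semantics shared by all GENERATE_SOP/GENERATE_DOP wrappers, shown
// here for add:
//
//   mask_add(src, k, a, b)[i]  == k[i] ? a[i] + b[i] : src[i]   // merge
//   maskz_add(k, a, b)[i]      == k[i] ? a[i] + b[i] : 0        // zero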
// saturated addition (8-/16-bit element types; Byte and Word are unsigned)
GENERATE_DOP(adds, Byte, epu8)
GENERATE_DOP(adds, SignedByte, epi8)
GENERATE_DOP(adds, Word, epu16)
GENERATE_DOP(adds, Short, epi16)

// subtraction
GENERATE_DOP(sub, Byte, epi8)
GENERATE_DOP(sub, SignedByte, epi8)
GENERATE_DOP(sub, Word, epi16)
GENERATE_DOP(sub, Short, epi16)
GENERATE_DOP(sub, Int, epi32)
GENERATE_DOP(sub, Float, ps)
GENERATE_DOP(sub, Long, epi64)
GENERATE_DOP(sub, Double, pd)

// saturated subtraction (8-/16-bit element types)
GENERATE_DOP(subs, Byte, epu8)
GENERATE_DOP(subs, SignedByte, epi8)
GENERATE_DOP(subs, Word, epu16)
GENERATE_DOP(subs, Short, epi16)

// multiplication and division (floating-point element types only)
GENERATE_DOP(mul, Float, ps)
GENERATE_DOP(mul, Double, pd)
GENERATE_DOP(div, Float, ps)
GENERATE_DOP(div, Double, pd)
// ceil / floor / round / truncate are identities for integral element types,
// so only the mask blend remains
template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_ceil(const Vec<T, 64> &src,
                                        const Mask<T, 64> &k,
                                        const Vec<T, 64> &a)
{
  return mask_ifelse(k, a, src);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_ceil(const Mask<T, 64> &k,
                                         const Vec<T, 64> &a)
{
  return mask_ifelsezero(k, a);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_floor(const Vec<T, 64> &src,
                                         const Mask<T, 64> &k,
                                         const Vec<T, 64> &a)
{
  return mask_ifelse(k, a, src);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_floor(const Mask<T, 64> &k,
                                          const Vec<T, 64> &a)
{
  return mask_ifelsezero(k, a);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_round(const Vec<T, 64> &src,
                                         const Mask<T, 64> &k,
                                         const Vec<T, 64> &a)
{
  return mask_ifelse(k, a, src);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_round(const Mask<T, 64> &k,
                                          const Vec<T, 64> &a)
{
  return mask_ifelsezero(k, a);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_truncate(const Vec<T, 64> &src,
                                            const Mask<T, 64> &k,
                                            const Vec<T, 64> &a)
{
  return mask_ifelse(k, a, src);
}

template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_truncate(const Mask<T, 64> &k,
                                             const Vec<T, 64> &a)
{
  return mask_ifelsezero(k, a);
}
static SIMD_INLINE Vec<Float, 64> mask_ceil(const Vec<Float, 64> &src,
                                            const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  return _mm512_mask_roundscale_ps(src, k, a,
                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> maskz_ceil(const Mask<Float, 64> &k,
                                             const Vec<Float, 64> &a)
{
  return _mm512_maskz_roundscale_ps(k, a,
                                    _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> mask_floor(const Vec<Float, 64> &src,
                                             const Mask<Float, 64> &k,
                                             const Vec<Float, 64> &a)
{
  return _mm512_mask_roundscale_ps(src, k, a,
                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> maskz_floor(const Mask<Float, 64> &k,
                                              const Vec<Float, 64> &a)
{
  return _mm512_maskz_roundscale_ps(k, a,
                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> mask_round(const Vec<Float, 64> &src,
                                             const Mask<Float, 64> &k,
                                             const Vec<Float, 64> &a)
{
  return _mm512_mask_roundscale_ps(
    src, k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> maskz_round(const Mask<Float, 64> &k,
                                              const Vec<Float, 64> &a)
{
  return _mm512_maskz_roundscale_ps(
    k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> mask_truncate(const Vec<Float, 64> &src,
                                                const Mask<Float, 64> &k,
                                                const Vec<Float, 64> &a)
{
  return _mm512_mask_roundscale_ps(src, k, a,
                                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> maskz_truncate(const Mask<Float, 64> &k,
                                                 const Vec<Float, 64> &a)
{
  return _mm512_maskz_roundscale_ps(k, a,
                                    _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
static SIMD_INLINE Vec<Double, 64> mask_ceil(const Vec<Double, 64> &src,
                                             const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a)
{
  return _mm512_mask_roundscale_pd(src, k, a,
                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> maskz_ceil(const Mask<Double, 64> &k,
                                              const Vec<Double, 64> &a)
{
  return _mm512_maskz_roundscale_pd(k, a,
                                    _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> mask_floor(const Vec<Double, 64> &src,
                                              const Mask<Double, 64> &k,
                                              const Vec<Double, 64> &a)
{
  return _mm512_mask_roundscale_pd(src, k, a,
                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> maskz_floor(const Mask<Double, 64> &k,
                                               const Vec<Double, 64> &a)
{
  return _mm512_maskz_roundscale_pd(k, a,
                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> mask_round(const Vec<Double, 64> &src,
                                              const Mask<Double, 64> &k,
                                              const Vec<Double, 64> &a)
{
  return _mm512_mask_roundscale_pd(
    src, k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> maskz_round(const Mask<Double, 64> &k,
                                               const Vec<Double, 64> &a)
{
  return _mm512_maskz_roundscale_pd(
    k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> mask_truncate(const Vec<Double, 64> &src,
                                                 const Mask<Double, 64> &k,
                                                 const Vec<Double, 64> &a)
{
  return _mm512_mask_roundscale_pd(src, k, a,
                                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> maskz_truncate(const Mask<Double, 64> &k,
                                                  const Vec<Double, 64> &a)
{
  return _mm512_maskz_roundscale_pd(k, a,
                                    _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
static SIMD_INLINE Vec<Float, 64> mask_rcp(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a)
{
  return _mm512_mask_rcp14_ps(src, k, a);
}

static SIMD_INLINE Vec<Float, 64> maskz_rcp(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  return _mm512_maskz_rcp14_ps(k, a);
}

static SIMD_INLINE Vec<Double, 64> mask_rcp(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a)
{
  return _mm512_mask_rcp14_pd(src, k, a);
}

static SIMD_INLINE Vec<Double, 64> maskz_rcp(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a)
{
  return _mm512_maskz_rcp14_pd(k, a);
}

static SIMD_INLINE Vec<Float, 64> mask_rsqrt(const Vec<Float, 64> &src,
                                             const Mask<Float, 64> &k,
                                             const Vec<Float, 64> &a)
{
  return _mm512_mask_rsqrt14_ps(src, k, a);
}

static SIMD_INLINE Vec<Float, 64> maskz_rsqrt(const Mask<Float, 64> &k,
                                              const Vec<Float, 64> &a)
{
  return _mm512_maskz_rsqrt14_ps(k, a);
}

static SIMD_INLINE Vec<Double, 64> mask_rsqrt(const Vec<Double, 64> &src,
                                              const Mask<Double, 64> &k,
                                              const Vec<Double, 64> &a)
{
  return _mm512_mask_rsqrt14_pd(src, k, a);
}

static SIMD_INLINE Vec<Double, 64> maskz_rsqrt(const Mask<Double, 64> &k,
                                               const Vec<Double, 64> &a)
{
  return _mm512_maskz_rsqrt14_pd(k, a);
}

GENERATE_SOP(sqrt, Float, ps)
GENERATE_SOP(sqrt, Double, pd)
// abs is the identity for unsigned integral element types
template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value &&
                                     std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_abs(const Vec<T, 64> &src,
                                       const Mask<T, 64> &k,
                                       const Vec<T, 64> &a)
{
  return mask_ifelse(k, a, src);
}

template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value &&
                                     std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_abs(const Mask<T, 64> &k,
                                        const Vec<T, 64> &a)
{
  return mask_ifelsezero(k, a);
}

GENERATE_SOP(abs, SignedByte, epi8)
GENERATE_SOP(abs, Short, epi16)
GENERATE_SOP(abs, Int, epi32)
GENERATE_SOP(abs, Long, epi64)
// there are no _mm512_maskz_abs_ps/pd intrinsics, so only the merging variant
// comes from MASK_SOP and the zeroing variants are written out by hand
MASK_SOP(abs, Float, ps)

static SIMD_INLINE Vec<Float, 64> maskz_abs(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  // body not shown in this excerpt; sketch (assumption): clear the sign bit
  // and zero the unselected lanes
  return _mm512_castsi512_ps(_mm512_maskz_and_epi32(
    k, _mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}

MASK_SOP(abs, Double, pd)

static SIMD_INLINE Vec<Double, 64> maskz_abs(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a)
{
  // body not shown in this excerpt; analogous sketch (assumption)
  return _mm512_castsi512_pd(_mm512_maskz_and_epi64(
    k, _mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffffLL)));
}
// bitwise and
GENERATE_DOP(and, Int, epi32)
GENERATE_DOP(and, Long, epi64)

#ifdef __AVX512DQ__
// guard reconstructed (assumption): _mm512_mask_and_ps/pd require AVX512DQ
GENERATE_DOP(and, Float, ps)
GENERATE_DOP(and, Double, pd)
#else
static SIMD_INLINE Vec<Float, 64> mask_and(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_castps_si512(src), k,
                                                   _mm512_castps_si512(a),
                                                   _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Float, 64> maskz_and(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a,
                                            const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(
    _mm512_maskz_and_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> mask_and(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(_mm512_mask_and_epi64(_mm512_castpd_si512(src), k,
                                                   _mm512_castpd_si512(a),
                                                   _mm512_castpd_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> maskz_and(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a,
                                             const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(
    _mm512_maskz_and_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
}
#endif
// bitwise or
GENERATE_DOP(or, Int, epi32)
GENERATE_DOP(or, Long, epi64)

#ifdef __AVX512DQ__
// guard reconstructed (assumption): _mm512_mask_or_ps/pd require AVX512DQ
GENERATE_DOP(or, Float, ps)
GENERATE_DOP(or, Double, pd)
#else
static SIMD_INLINE Vec<Float, 64> mask_or(const Vec<Float, 64> &src,
                                          const Mask<Float, 64> &k,
                                          const Vec<Float, 64> &a,
                                          const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_castps_si512(src), k,
                                                  _mm512_castps_si512(a),
                                                  _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Float, 64> maskz_or(const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(
    _mm512_maskz_or_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> mask_or(const Vec<Double, 64> &src,
                                           const Mask<Double, 64> &k,
                                           const Vec<Double, 64> &a,
                                           const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(_mm512_mask_or_epi64(_mm512_castpd_si512(src), k,
                                                  _mm512_castpd_si512(a),
                                                  _mm512_castpd_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> maskz_or(const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(
    _mm512_maskz_or_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
}
#endif
// bitwise andnot (~a & b)
GENERATE_DOP(andnot, Int, epi32)
GENERATE_DOP(andnot, Long, epi64)

#ifdef __AVX512DQ__
// guard reconstructed (assumption): _mm512_mask_andnot_ps/pd require AVX512DQ
GENERATE_DOP(andnot, Float, ps)
GENERATE_DOP(andnot, Double, pd)
#else
static SIMD_INLINE Vec<Float, 64> mask_andnot(const Vec<Float, 64> &src,
                                              const Mask<Float, 64> &k,
                                              const Vec<Float, 64> &a,
                                              const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_castps_si512(src),
                                                      k, _mm512_castps_si512(a),
                                                      _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Float, 64> maskz_andnot(const Mask<Float, 64> &k,
                                               const Vec<Float, 64> &a,
                                               const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(_mm512_maskz_andnot_epi32(
    k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> mask_andnot(const Vec<Double, 64> &src,
                                               const Mask<Double, 64> &k,
                                               const Vec<Double, 64> &a,
                                               const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(_mm512_mask_andnot_epi64(_mm512_castpd_si512(src),
                                                      k, _mm512_castpd_si512(a),
                                                      _mm512_castpd_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> maskz_andnot(const Mask<Double, 64> &k,
                                                const Vec<Double, 64> &a,
                                                const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(_mm512_maskz_andnot_epi64(
    k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
}
#endif
// bitwise xor
GENERATE_DOP(xor, Int, epi32)
GENERATE_DOP(xor, Long, epi64)

#ifdef __AVX512DQ__
// guard reconstructed (assumption): _mm512_mask_xor_ps/pd require AVX512DQ
GENERATE_DOP(xor, Float, ps)
GENERATE_DOP(xor, Double, pd)
#else
static SIMD_INLINE Vec<Float, 64> mask_xor(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(src), k,
                                                   _mm512_castps_si512(a),
                                                   _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Float, 64> maskz_xor(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a,
                                            const Vec<Float, 64> &b)
{
  return _mm512_castsi512_ps(
    _mm512_maskz_xor_epi32(k, _mm512_castps_si512(a), _mm512_castps_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> mask_xor(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(_mm512_mask_xor_epi64(_mm512_castpd_si512(src), k,
                                                   _mm512_castpd_si512(a),
                                                   _mm512_castpd_si512(b)));
}

static SIMD_INLINE Vec<Double, 64> maskz_xor(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a,
                                             const Vec<Double, 64> &b)
{
  return _mm512_castsi512_pd(
    _mm512_maskz_xor_epi64(k, _mm512_castpd_si512(a), _mm512_castpd_si512(b)));
}
#endif
// bitwise not, implemented as xor with all-ones
static SIMD_INLINE Vec<Int, 64> mask_not(const Vec<Int, 64> &src,
                                         const Mask<Int, 64> &k,
                                         const Vec<Int, 64> &a)
{
  return _mm512_mask_xor_epi32(src, k, a, _mm512_set1_epi32(-1));
}

static SIMD_INLINE Vec<Int, 64> maskz_not(const Mask<Int, 64> &k,
                                          const Vec<Int, 64> &a)
{
  return _mm512_maskz_xor_epi32(k, a, _mm512_set1_epi32(-1));
}

static SIMD_INLINE Vec<Long, 64> mask_not(const Vec<Long, 64> &src,
                                          const Mask<Long, 64> &k,
                                          const Vec<Long, 64> &a)
{
  return _mm512_mask_xor_epi64(src, k, a, _mm512_set1_epi64(-1));
}

static SIMD_INLINE Vec<Long, 64> maskz_not(const Mask<Long, 64> &k,
                                           const Vec<Long, 64> &a)
{
  return _mm512_maskz_xor_epi64(k, a, _mm512_set1_epi64(-1));
}

#ifdef __AVX512DQ__
// guard reconstructed (assumption): _mm512_mask_xor_ps/pd require AVX512DQ
static SIMD_INLINE Vec<Float, 64> mask_not(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a)
{
  return _mm512_mask_xor_ps(src, k, a,
                            _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
}

static SIMD_INLINE Vec<Float, 64> maskz_not(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  return _mm512_maskz_xor_ps(k, a, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
}

static SIMD_INLINE Vec<Double, 64> mask_not(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a)
{
  return _mm512_mask_xor_pd(src, k, a,
                            _mm512_castsi512_pd(_mm512_set1_epi64(-1)));
}

static SIMD_INLINE Vec<Double, 64> maskz_not(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a)
{
  return _mm512_maskz_xor_pd(k, a, _mm512_castsi512_pd(_mm512_set1_epi64(-1)));
}
#else
static SIMD_INLINE Vec<Float, 64> mask_not(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a)
{
  return _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(src), k,
                                                   _mm512_castps_si512(a),
                                                   _mm512_set1_epi32(-1)));
}

static SIMD_INLINE Vec<Float, 64> maskz_not(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  return _mm512_castsi512_ps(
    _mm512_maskz_xor_epi32(k, _mm512_castps_si512(a), _mm512_set1_epi32(-1)));
}

static SIMD_INLINE Vec<Double, 64> mask_not(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a)
{
  return _mm512_castsi512_pd(_mm512_mask_xor_epi64(_mm512_castpd_si512(src), k,
                                                   _mm512_castpd_si512(a),
                                                   _mm512_set1_epi64(-1)));
}

static SIMD_INLINE Vec<Double, 64> maskz_not(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a)
{
  return _mm512_castsi512_pd(
    _mm512_maskz_xor_epi64(k, _mm512_castpd_si512(a), _mm512_set1_epi64(-1)));
}
#endif
#define GENERATE_NEG(TYPE, SUF) \
  static SIMD_INLINE Vec<TYPE, 64> mask_neg( \
    const Vec<TYPE, 64> &src, const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a) \
  { \
    return _mm512_mask_sub_##SUF(src, k, setzero<TYPE, 64>(), a); \
  } \
  static SIMD_INLINE Vec<TYPE, 64> maskz_neg(const Mask<TYPE, 64> &k, \
                                             const Vec<TYPE, 64> &a) \
  { \
    return _mm512_maskz_sub_##SUF(k, setzero<TYPE, 64>(), a); \
  }

GENERATE_NEG(SignedByte, epi8)
GENERATE_NEG(Short, epi16)
GENERATE_NEG(Int, epi32)
GENERATE_NEG(Float, ps)
GENERATE_NEG(Long, epi64)
GENERATE_NEG(Double, pd)
GENERATE_DOP(min, Byte, epu8)
GENERATE_DOP(min, SignedByte, epi8)
GENERATE_DOP(min, Word, epu16)
GENERATE_DOP(min, Short, epi16)
GENERATE_DOP(min, Int, epi32)
GENERATE_DOP(min, Float, ps)
GENERATE_DOP(min, Long, epi64)
GENERATE_DOP(min, Double, pd)

GENERATE_DOP(max, Byte, epu8)
GENERATE_DOP(max, SignedByte, epi8)
GENERATE_DOP(max, Word, epu16)
GENERATE_DOP(max, Short, epi16)
GENERATE_DOP(max, Int, epi32)
GENERATE_DOP(max, Float, ps)
GENERATE_DOP(max, Long, epi64)
GENERATE_DOP(max, Double, pd)
template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> mask_srai(const Vec<Word, 64> &src,
                                           const Mask<Word, 64> &k,
                                           const Vec<Word, 64> &a)
{
  return _mm512_mask_srai_epi16(src, k, a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> maskz_srai(const Mask<Word, 64> &k,
                                            const Vec<Word, 64> &a)
{
  return _mm512_maskz_srai_epi16(k, a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> mask_srai(const Vec<Short, 64> &src,
                                            const Mask<Short, 64> &k,
                                            const Vec<Short, 64> &a)
{
  return _mm512_mask_srai_epi16(src, k, a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> maskz_srai(const Mask<Short, 64> &k,
                                             const Vec<Short, 64> &a)
{
  return _mm512_maskz_srai_epi16(k, a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> mask_srai(const Vec<Int, 64> &src,
                                          const Mask<Int, 64> &k,
                                          const Vec<Int, 64> &a)
{
  return _mm512_mask_srai_epi32(src, k, a, vec::min(COUNT, 31ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> maskz_srai(const Mask<Int, 64> &k,
                                           const Vec<Int, 64> &a)
{
  return _mm512_maskz_srai_epi32(k, a, vec::min(COUNT, 31ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> mask_srai(const Vec<Long, 64> &src,
                                           const Mask<Long, 64> &k,
                                           const Vec<Long, 64> &a)
{
  return _mm512_mask_srai_epi64(src, k, a, vec::min(COUNT, 63ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> maskz_srai(const Mask<Long, 64> &k,
                                            const Vec<Long, 64> &a)
{
  return _mm512_maskz_srai_epi64(k, a, vec::min(COUNT, 63ul));
}
template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> mask_srli(const Vec<Word, 64> &src,
                                           const Mask<Word, 64> &k,
                                           const Vec<Word, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_mask_srli_epi16(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> maskz_srli(const Mask<Word, 64> &k,
                                            const Vec<Word, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_maskz_srli_epi16(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> mask_srli(const Vec<Short, 64> &src,
                                            const Mask<Short, 64> &k,
                                            const Vec<Short, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_mask_srli_epi16(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> maskz_srli(const Mask<Short, 64> &k,
                                             const Vec<Short, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_maskz_srli_epi16(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> mask_srli(const Vec<Int, 64> &src,
                                          const Mask<Int, 64> &k,
                                          const Vec<Int, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm512_mask_srli_epi32(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi32(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> maskz_srli(const Mask<Int, 64> &k,
                                           const Vec<Int, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm512_maskz_srli_epi32(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> mask_srli(const Vec<Long, 64> &src,
                                           const Mask<Long, 64> &k,
                                           const Vec<Long, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm512_mask_srli_epi64(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi64(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> maskz_srli(const Mask<Long, 64> &k,
                                            const Vec<Long, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm512_maskz_srli_epi64(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}
template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> mask_slli(const Vec<Word, 64> &src,
                                           const Mask<Word, 64> &k,
                                           const Vec<Word, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_mask_slli_epi16(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Word, 64> maskz_slli(const Mask<Word, 64> &k,
                                            const Vec<Word, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_maskz_slli_epi16(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> mask_slli(const Vec<Short, 64> &src,
                                            const Mask<Short, 64> &k,
                                            const Vec<Short, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_mask_slli_epi16(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi16(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 64> maskz_slli(const Mask<Short, 64> &k,
                                             const Vec<Short, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm512_maskz_slli_epi16(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> mask_slli(const Vec<Int, 64> &src,
                                          const Mask<Int, 64> &k,
                                          const Vec<Int, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm512_mask_slli_epi32(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi32(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 64> maskz_slli(const Mask<Int, 64> &k,
                                           const Vec<Int, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm512_maskz_slli_epi32(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> mask_slli(const Vec<Long, 64> &src,
                                           const Mask<Long, 64> &k,
                                           const Vec<Long, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm512_mask_slli_epi64(src, k, a, COUNT);
  } else {
    return _mm512_mask_blend_epi64(k, src, _mm512_setzero_si512());
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 64> maskz_slli(const Mask<Long, 64> &k,
                                            const Vec<Long, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm512_maskz_slli_epi64(k, a, COUNT);
  } else {
    return _mm512_setzero_si512();
  }
}
// horizontal pairwise add: de-interleave (a, b) into even/odd lanes, then add
template <typename T>
static SIMD_INLINE Vec<T, 64> mask_hadd(const Vec<T, 64> &src,
                                        const Mask<T, 64> &k,
                                        const Vec<T, 64> &a,
                                        const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::mask_add(src, k, x, y);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> maskz_hadd(const Mask<T, 64> &k,
                                         const Vec<T, 64> &a,
                                         const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::maskz_add(k, x, y);
}

// horizontal pairwise add with saturation
template <typename T>
static SIMD_INLINE Vec<T, 64> mask_hadds(const Vec<T, 64> &src,
                                         const Mask<T, 64> &k,
                                         const Vec<T, 64> &a,
                                         const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::mask_adds(src, k, x, y);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> maskz_hadds(const Mask<T, 64> &k,
                                          const Vec<T, 64> &a,
                                          const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::maskz_adds(k, x, y);
}

// horizontal pairwise subtraction
template <typename T>
static SIMD_INLINE Vec<T, 64> mask_hsub(const Vec<T, 64> &src,
                                        const Mask<T, 64> &k,
                                        const Vec<T, 64> &a,
                                        const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::mask_sub(src, k, x, y);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> maskz_hsub(const Mask<T, 64> &k,
                                         const Vec<T, 64> &a,
                                         const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::maskz_sub(k, x, y);
}

// horizontal pairwise subtraction with saturation
template <typename T>
static SIMD_INLINE Vec<T, 64> mask_hsubs(const Vec<T, 64> &src,
                                         const Mask<T, 64> &k,
                                         const Vec<T, 64> &a,
                                         const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::mask_subs(src, k, x, y);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> maskz_hsubs(const Mask<T, 64> &k,
                                          const Vec<T, 64> &a,
                                          const Vec<T, 64> &b)
{
  Vec<T, 64> x, y;
  unzip<1>(a, b, x, y);
  return internal::mask::maskz_subs(k, x, y);
}
#define GENERATE_CMP(OP, TYPE, SUF) \
  static SIMD_INLINE Mask<TYPE, 64> mask_##OP( \
    const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
  { \
    return _mm512_mask_##OP##_##SUF##_mask(k, a, b); \
  } \
  static SIMD_INLINE Mask<TYPE, 64> mask_##OP(const Vec<TYPE, 64> &a, \
                                              const Vec<TYPE, 64> &b) \
  { \
    return _mm512_##OP##_##SUF##_mask(a, b); \
  }

#define GENERATE_CMP_WITH_GENERALIZED_FCT(OP, TYPE, SUF, IMM8) \
  static SIMD_INLINE Mask<TYPE, 64> mask_##OP( \
    const Mask<TYPE, 64> &k, const Vec<TYPE, 64> &a, const Vec<TYPE, 64> &b) \
  { \
    return _mm512_mask_cmp_##SUF##_mask(k, a, b, IMM8); \
  } \
  static SIMD_INLINE Mask<TYPE, 64> mask_##OP(const Vec<TYPE, 64> &a, \
                                              const Vec<TYPE, 64> &b) \
  { \
    return _mm512_cmp_##SUF##_mask(a, b, IMM8); \
  }

GENERATE_CMP(cmplt, Byte, epu8)
GENERATE_CMP(cmplt, SignedByte, epi8)
GENERATE_CMP(cmplt, Word, epu16)
GENERATE_CMP(cmplt, Short, epi16)
GENERATE_CMP(cmplt, Int, epi32)
GENERATE_CMP(cmplt, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmplt, Float, ps, _CMP_LT_OS)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmplt, Double, pd, _CMP_LT_OS)

GENERATE_CMP(cmple, Byte, epu8)
GENERATE_CMP(cmple, SignedByte, epi8)
GENERATE_CMP(cmple, Word, epu16)
GENERATE_CMP(cmple, Short, epi16)
GENERATE_CMP(cmple, Int, epi32)
GENERATE_CMP(cmple, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmple, Float, ps, _CMP_LE_OS)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmple, Double, pd, _CMP_LE_OS)

GENERATE_CMP(cmpeq, Byte, epu8)
GENERATE_CMP(cmpeq, SignedByte, epi8)
GENERATE_CMP(cmpeq, Word, epu16)
GENERATE_CMP(cmpeq, Short, epi16)
GENERATE_CMP(cmpeq, Int, epi32)
GENERATE_CMP(cmpeq, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpeq, Float, ps, _CMP_EQ_OQ)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpeq, Double, pd, _CMP_EQ_OQ)

GENERATE_CMP(cmpgt, Byte, epu8)
GENERATE_CMP(cmpgt, SignedByte, epi8)
GENERATE_CMP(cmpgt, Word, epu16)
GENERATE_CMP(cmpgt, Short, epi16)
GENERATE_CMP(cmpgt, Int, epi32)
GENERATE_CMP(cmpgt, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpgt, Float, ps, _CMP_GT_OS)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpgt, Double, pd, _CMP_GT_OS)

GENERATE_CMP(cmpge, Byte, epu8)
GENERATE_CMP(cmpge, SignedByte, epi8)
GENERATE_CMP(cmpge, Word, epu16)
GENERATE_CMP(cmpge, Short, epi16)
GENERATE_CMP(cmpge, Int, epi32)
GENERATE_CMP(cmpge, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpge, Float, ps, _CMP_GE_OS)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpge, Double, pd, _CMP_GE_OS)

GENERATE_CMP(cmpneq, Byte, epu8)
GENERATE_CMP(cmpneq, SignedByte, epi8)
GENERATE_CMP(cmpneq, Word, epu16)
GENERATE_CMP(cmpneq, Short, epi16)
GENERATE_CMP(cmpneq, Int, epi32)
GENERATE_CMP(cmpneq, Long, epi64)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpneq, Float, ps, _CMP_NEQ_OQ)
GENERATE_CMP_WITH_GENERALIZED_FCT(cmpneq, Double, pd, _CMP_NEQ_OQ)
static SIMD_INLINE Vec<Byte, 64> mask_avg(const Vec<Byte, 64> &src,
                                          const Mask<Byte, 64> &k,
                                          const Vec<Byte, 64> &a,
                                          const Vec<Byte, 64> &b)
{
  return _mm512_mask_avg_epu8(src, k, a, b);
}

static SIMD_INLINE Vec<Byte, 64> maskz_avg(const Mask<Byte, 64> &k,
                                           const Vec<Byte, 64> &a,
                                           const Vec<Byte, 64> &b)
{
  return _mm512_maskz_avg_epu8(k, a, b);
}

static SIMD_INLINE Vec<Word, 64> mask_avg(const Vec<Word, 64> &src,
                                          const Mask<Word, 64> &k,
                                          const Vec<Word, 64> &a,
                                          const Vec<Word, 64> &b)
{
  return _mm512_mask_avg_epu16(src, k, a, b);
}

static SIMD_INLINE Vec<Word, 64> maskz_avg(const Mask<Word, 64> &k,
                                           const Vec<Word, 64> &a,
                                           const Vec<Word, 64> &b)
{
  return _mm512_maskz_avg_epu16(k, a, b);
}
// signed integral average (rounding up), emulated without intermediate overflow
template <typename T,
          SIMD_ENABLE_IF(std::is_integral<T>::value && std::is_signed<T>::value)>
static SIMD_INLINE Vec<T, 64> mask_avg(const Vec<T, 64> &src,
                                       const Mask<T, 64> &k,
                                       const Vec<T, 64> &a,
                                       const Vec<T, 64> &b)
{
  const auto one = set1<T, 64>(1); // vector of ones (not shown in the excerpt)
  const auto lsb = bit_and(bit_or(a, b), one);
  const auto as  = srai<1>(a);
  const auto bs  = srai<1>(b);
  return internal::mask::mask_add(src, k, lsb, add(as, bs));
}

template <typename T,
          SIMD_ENABLE_IF(std::is_integral<T>::value && std::is_signed<T>::value)>
static SIMD_INLINE Vec<T, 64> maskz_avg(const Mask<T, 64> &k,
                                        const Vec<T, 64> &a,
                                        const Vec<T, 64> &b)
{
  const auto one = set1<T, 64>(1); // vector of ones (not shown in the excerpt)
  const auto lsb = bit_and(bit_or(a, b), one);
  const auto as  = srai<1>(a);
  const auto bs  = srai<1>(b);
  return internal::mask::maskz_add(k, lsb, add(as, bs));
}
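
// The emulation above computes the average rounded up while staying inside
// the element range: avg(a, b) = (a >> 1) + (b >> 1) + ((a | b) & 1).
// Example: a = 5, b = 2 gives 2 + 1 + 1 = 4 == ceil(7 / 2), matching the
// rounding behaviour of _mm512_avg_epu8/epu16.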
static SIMD_INLINE Vec<Float, 64> mask_avg(const Vec<Float, 64> &src,
                                           const Mask<Float, 64> &k,
                                           const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b)
{
  return _mm512_mask_mul_ps(src, k, _mm512_maskz_add_ps(k, a, b),
                            _mm512_set1_ps(0.5f));
}

static SIMD_INLINE Vec<Float, 64> maskz_avg(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a,
                                            const Vec<Float, 64> &b)
{
  return _mm512_maskz_mul_ps(k, _mm512_maskz_add_ps(k, a, b),
                             _mm512_set1_ps(0.5f));
}

static SIMD_INLINE Vec<Double, 64> mask_avg(const Vec<Double, 64> &src,
                                            const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b)
{
  return _mm512_mask_mul_pd(src, k, _mm512_maskz_add_pd(k, a, b),
                            _mm512_set1_pd(0.5));
}

static SIMD_INLINE Vec<Double, 64> maskz_avg(const Mask<Double, 64> &k,
                                             const Vec<Double, 64> &a,
                                             const Vec<Double, 64> &b)
{
  return _mm512_maskz_mul_pd(k, _mm512_maskz_add_pd(k, a, b),
                             _mm512_set1_pd(0.5));
}
#define TEST_ALL_ZEROS(TYPE, SUF) \
  static SIMD_INLINE bool mask_test_all_zeros(const Mask<TYPE, 64> &k, \
                                              const Vec<TYPE, 64> &a) \
  { \
    return (_mm512_mask_test_epi##SUF##_mask(k, a, a) == 0); \
  }

TEST_ALL_ZEROS(Byte, 8)
TEST_ALL_ZEROS(SignedByte, 8)
TEST_ALL_ZEROS(Word, 16)
TEST_ALL_ZEROS(Short, 16)
TEST_ALL_ZEROS(Int, 32)
TEST_ALL_ZEROS(Long, 64)

static SIMD_INLINE bool mask_test_all_zeros(const Mask<Float, 64> &k,
                                            const Vec<Float, 64> &a)
{
  return (_mm512_mask_test_epi32_mask(k, _mm512_castps_si512(a),
                                      _mm512_castps_si512(a)) == 0);
}

static SIMD_INLINE bool mask_test_all_zeros(const Mask<Double, 64> &k,
                                            const Vec<Double, 64> &a)
{
  return (_mm512_mask_test_epi64_mask(k, _mm512_castpd_si512(a),
                                      _mm512_castpd_si512(a)) == 0);
}
// mask with all lanes selected; the constant has one bit per lane (signature
// and body reconstructed from the use in mask_test_all_ones below)
#define MASK_ALL_ONES(TYPE, MASK) \
  static SIMD_INLINE Mask<TYPE, 64> mask_all_ones(OutputType<TYPE>, \
                                                  Integer<64>) \
  { \
    return Mask<TYPE, 64>(MASK); \
  }

MASK_ALL_ONES(Byte, 0xFFFFFFFFFFFFFFFF)
MASK_ALL_ONES(SignedByte, 0xFFFFFFFFFFFFFFFF)
MASK_ALL_ONES(Word, 0xFFFFFFFF)
MASK_ALL_ONES(Short, 0xFFFFFFFF)
MASK_ALL_ONES(Int, 0xFFFF)
MASK_ALL_ONES(Float, 0xFFFF)
MASK_ALL_ONES(Long, 0xFF)
MASK_ALL_ONES(Double, 0xFF)
#define GENERATE_DMASKOP(NAME, TYPE, NUM) \
  static SIMD_INLINE Mask<TYPE, 64> k##NAME(const Mask<TYPE, 64> &a, \
                                            const Mask<TYPE, 64> &b) \
  { \
    return _k##NAME##_mask##NUM(a, b); \
  }

#define KNOT(TYPE, NUM) \
  static SIMD_INLINE Mask<TYPE, 64> knot(const Mask<TYPE, 64> &a) \
  { \
    return _knot_mask##NUM(a); \
  }

#define KSHIFT(R_OR_L, TYPE, NUM) \
  template <size_t COUNT> \
  static SIMD_INLINE Mask<TYPE, 64> kshift##R_OR_L##i(const Mask<TYPE, 64> &a) \
  { \
    return _kshift##R_OR_L##i_mask##NUM(a, COUNT); \
  }
GENERATE_DMASKOP(and, Byte, 64)
GENERATE_DMASKOP(and, SignedByte, 64)
GENERATE_DMASKOP(and, Word, 32)
GENERATE_DMASKOP(and, Short, 32)

GENERATE_DMASKOP(andn, Byte, 64)
GENERATE_DMASKOP(andn, SignedByte, 64)
GENERATE_DMASKOP(andn, Word, 32)
GENERATE_DMASKOP(andn, Short, 32)

GENERATE_DMASKOP(or, Byte, 64)
GENERATE_DMASKOP(or, SignedByte, 64)
GENERATE_DMASKOP(or, Word, 32)
GENERATE_DMASKOP(or, Short, 32)

GENERATE_DMASKOP(xor, Byte, 64)
GENERATE_DMASKOP(xor, SignedByte, 64)
GENERATE_DMASKOP(xor, Word, 32)
GENERATE_DMASKOP(xor, Short, 32)

GENERATE_DMASKOP(xnor, Byte, 64)
GENERATE_DMASKOP(xnor, SignedByte, 64)
GENERATE_DMASKOP(xnor, Word, 32)
GENERATE_DMASKOP(xnor, Short, 32)

GENERATE_DMASKOP(add, Byte, 64)
GENERATE_DMASKOP(add, SignedByte, 64)
GENERATE_DMASKOP(add, Word, 32)
GENERATE_DMASKOP(add, Short, 32)

KSHIFT(r, SignedByte, 64)
KSHIFT(l, SignedByte, 64)

GENERATE_DMASKOP(and, Int, 16)
GENERATE_DMASKOP(and, Float, 16)
GENERATE_DMASKOP(and, Long, 8)
GENERATE_DMASKOP(and, Double, 8)

GENERATE_DMASKOP(andn, Int, 16)
GENERATE_DMASKOP(andn, Float, 16)
GENERATE_DMASKOP(andn, Long, 8)
GENERATE_DMASKOP(andn, Double, 8)

GENERATE_DMASKOP(or, Int, 16)
GENERATE_DMASKOP(or, Float, 16)
GENERATE_DMASKOP(or, Long, 8)
GENERATE_DMASKOP(or, Double, 8)

GENERATE_DMASKOP(xor, Int, 16)
GENERATE_DMASKOP(xor, Float, 16)
GENERATE_DMASKOP(xor, Long, 8)
GENERATE_DMASKOP(xor, Double, 8)

GENERATE_DMASKOP(xnor, Int, 16)
GENERATE_DMASKOP(xnor, Float, 16)
GENERATE_DMASKOP(xnor, Long, 8)
GENERATE_DMASKOP(xnor, Double, 8)

GENERATE_DMASKOP(add, Int, 16)
GENERATE_DMASKOP(add, Float, 16)
GENERATE_DMASKOP(add, Long, 8)
GENERATE_DMASKOP(add, Double, 8)
// Fallback variants for 16-/8-bit masks using the AVX512F _mm512_k*
// intrinsics; selected instead of the definitions above when the
// type-specific _k*_mask* intrinsics are not available (the selecting
// preprocessor conditionals are not part of this excerpt).
#define GENERATE_DMASKOP(NAME, TYPE, NUM) \
  static SIMD_INLINE Mask<TYPE, 64> k##NAME(const Mask<TYPE, 64> &a, \
                                            const Mask<TYPE, 64> &b) \
  { \
    return _mm512_k##NAME(a, b); \
  }

#define KNOT(TYPE, NUM) \
  static SIMD_INLINE Mask<TYPE, 64> knot(const Mask<TYPE, 64> &a) \
  { \
    return _mm512_knot(a); \
  }

GENERATE_DMASKOP(and, Int, 16)
GENERATE_DMASKOP(and, Float, 16)
GENERATE_DMASKOP(and, Long, 8)
GENERATE_DMASKOP(and, Double, 8)

GENERATE_DMASKOP(andn, Int, 16)
GENERATE_DMASKOP(andn, Float, 16)
GENERATE_DMASKOP(andn, Long, 8)
GENERATE_DMASKOP(andn, Double, 8)

GENERATE_DMASKOP(or, Int, 16)
GENERATE_DMASKOP(or, Float, 16)
GENERATE_DMASKOP(or, Long, 8)
GENERATE_DMASKOP(or, Double, 8)

GENERATE_DMASKOP(xor, Int, 16)
GENERATE_DMASKOP(xor, Float, 16)
GENERATE_DMASKOP(xor, Long, 8)
GENERATE_DMASKOP(xor, Double, 8)

GENERATE_DMASKOP(xnor, Int, 16)
GENERATE_DMASKOP(xnor, Float, 16)
GENERATE_DMASKOP(xnor, Long, 8)
GENERATE_DMASKOP(xnor, Double, 8)
// Generic emulations operating on the underlying mask integer. The bodies of
// kand .. knot are not shown in this excerpt and are sketched here
// (assumption), following the same uint64_t pattern as kshiftri/kshiftli
// below.
template <typename T>
static SIMD_INLINE Mask<T, 64> kand(const Mask<T, 64> &a, const Mask<T, 64> &b)
{
  return ((uint64_t) a) & ((uint64_t) b);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kandn(const Mask<T, 64> &a,
                                     const Mask<T, 64> &b)
{
  return (~(uint64_t) a) & ((uint64_t) b);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kor(const Mask<T, 64> &a, const Mask<T, 64> &b)
{
  return ((uint64_t) a) | ((uint64_t) b);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kxor(const Mask<T, 64> &a, const Mask<T, 64> &b)
{
  return ((uint64_t) a) ^ ((uint64_t) b);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kxnor(const Mask<T, 64> &a,
                                     const Mask<T, 64> &b)
{
  return ~(((uint64_t) a) ^ ((uint64_t) b));
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kadd(const Mask<T, 64> &a, const Mask<T, 64> &b)
{
  return ((uint64_t) a) + ((uint64_t) b);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> knot(const Mask<T, 64> &a)
{
  return ~((uint64_t) a);
}

template <size_t COUNT, typename T>
static SIMD_INLINE Mask<T, 64> kshiftri(const Mask<T, 64> &a)
{
  if (COUNT >= 64) { return 0; }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshift-count-overflow"
  return ((uint64_t) a) >> ((uint64_t) COUNT);
#pragma GCC diagnostic pop
}

template <size_t COUNT, typename T>
static SIMD_INLINE Mask<T, 64> kshiftli(const Mask<T, 64> &a)
{
  if (COUNT >= 64) { return 0; }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshift-count-overflow"
  return ((uint64_t) a) << ((uint64_t) COUNT);
#pragma GCC diagnostic pop
}
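
// Usage sketch (illustrative only): shifting a mask moves the selection
// across lanes, e.g.
//
//   Mask<Byte, 64> m = ...;
//   Mask<Byte, 64> m1 = kshiftri<1>(m); // bit i of m1 is bit i + 1 of m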
template <typename T>
static SIMD_INLINE Mask<T, 64> kshiftli(const Mask<T, 64> &a, uint64_t count)
{
  if (count >= 64) { return Mask<T, 64>(0); }
  return Mask<T, 64>(((uint64_t) a) << count);
}

template <typename T>
static SIMD_INLINE Mask<T, 64> kshiftri(const Mask<T, 64> &a, uint64_t count)
{
  if (count >= 64) { return Mask<T, 64>(0); }
  return Mask<T, 64>(((uint64_t) a) >> count);
}
template <typename T>
static SIMD_INLINE bool mask_test_all_zeros(const Mask<T, 64> &a)
{
  // body not shown in this excerpt; sketch (assumption)
  return a == Mask<T, 64>(0);
}

template <typename T>
static SIMD_INLINE bool mask_test_all_ones(const Mask<T, 64> &a)
{
  return a == mask_all_ones(OutputType<T>(), Integer<64>());
}

// per-bit equality of two masks
template <typename T>
static SIMD_INLINE Mask<T, 64> kcmpeq(const Mask<T, 64> &a,
                                      const Mask<T, 64> &b)
{
  // body not shown in this excerpt; bitwise equality is kxnor (assumption)
  return kxnor(a, b);
}