#ifndef SIMD_VEC_BASE_IMPL_INTEL_64_H_
#define SIMD_VEC_BASE_IMPL_INTEL_64_H_

#include "base_impl_intel16.H"
#include "base_impl_intel32.H"
#include "intrins_intel.H"

#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_64_AVAIL_) && \
  !defined(SIMDVEC_SANDBOX)
template <typename T>
class Vec<T, 64>
{
  __m512i zmm = _mm512_setzero_si512();

public:
  static constexpr size_t elements = 64 / sizeof(T);
  static constexpr size_t bytes = 64;
  Vec() = default;
  Vec(const __m512i &x) { zmm = x; }
  Vec &operator=(const __m512i &x)
  {
    zmm = x;
    return *this;
  }
  operator __m512i() const { return zmm; }
  Vec(const Vec<T, 32> &lo, const Vec<T, 32> &hi)
  {
    zmm = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
  }
  SIMD_INLINE Vec<T, 32> lo() const { return _mm512_castsi512_si256(zmm); }
  SIMD_INLINE Vec<T, 32> hi() const
  {
    return _mm512_extracti64x4_epi64(zmm, 1);
  }
};
template <>
class Vec<Float, 64>
{
  __m512 zmm = _mm512_setzero_ps();

public:
  static constexpr size_t elements = 64 / sizeof(Float);
  static constexpr size_t bytes = 64;
  Vec() = default;
  Vec(const __m512 &x) { zmm = x; }
  Vec &operator=(const __m512 &x)
  {
    zmm = x;
    return *this;
  }
  operator __m512() const { return zmm; }
  Vec(const Vec<Float, 32> &lo, const Vec<Float, 32> &hi)
  {
    // going through pd avoids the AVX-512DQ-only _mm512_insertf32x8
    zmm = _mm512_castpd_ps(_mm512_insertf64x4(
      _mm512_castps_pd(_mm512_castps256_ps512(lo)), _mm256_castps_pd(hi), 1));
  }
  SIMD_INLINE Vec<Float, 32> lo() const { return _mm512_castps512_ps256(zmm); }
  SIMD_INLINE Vec<Float, 32> hi() const
  {
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(zmm), 1));
  }
};
template <>
class Vec<Double, 64>
{
  __m512d zmm = _mm512_setzero_pd();

public:
  static constexpr size_t elements = 64 / sizeof(Double);
  static constexpr size_t bytes = 64;
  Vec() = default;
  Vec(const __m512d &x) { zmm = x; }
  Vec &operator=(const __m512d &x)
  {
    zmm = x;
    return *this;
  }
  operator __m512d() const { return zmm; }
  Vec(const Vec<Double, 32> &lo, const Vec<Double, 32> &hi)
  {
    zmm = _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1);
  }
  SIMD_INLINE Vec<Double, 32> lo() const { return _mm512_castpd512_pd256(zmm); }
  SIMD_INLINE Vec<Double, 32> hi() const
  {
    return _mm512_extractf64x4_pd(zmm, 1);
  }
};
template <size_t COUNT>
static SIMD_INLINE __m512i x_mm512_alignr_epi8(__m512i h, __m512i l)
{
  static_assert(COUNT < 32, "");
#ifdef __AVX512BW__
  return _mm512_alignr_epi8(h, l, COUNT);
#else
  const __m256i lo = _mm256_alignr_epi8(_mm512_castsi512_si256(h),
                                        _mm512_castsi512_si256(l), COUNT);
  const __m256i hi = _mm256_alignr_epi8(_mm512_extracti64x4_epi64(h, 1),
                                        _mm512_extracti64x4_epi64(l, 1), COUNT);
  return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
#endif
}
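// NOTE: As with its SSE/AVX ancestors, alignr_epi8 concatenates and shifts
// within each 128-bit lane rather than across the whole register, so the
// AVX2 fallback above (two 256-bit alignr ops) produces exactly the same
// result as the AVX-512BW intrinsic.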
static SIMD_INLINE __m512i x_mm512_transpose8x64_epi64(__m512i a)
{
  return _mm512_permutexvar_epi64(_mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0), a);
}

static SIMD_INLINE __m512i x_mm512_evenodd8x64_epi64(__m512i a)
{
  return _mm512_permutexvar_epi64(_mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0), a);
}
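// Worked example of these two permutations (64-bit elements listed from
// low to high): transpose8x64 interleaves the two 256-bit halves,
//   (a0,a1,a2,a3,a4,a5,a6,a7) -> (a0,a4,a1,a5,a2,a6,a3,a7),
// while evenodd8x64 is its inverse, gathering even- and odd-indexed
// elements into the low and high half, respectively:
//   (a0,a1,a2,a3,a4,a5,a6,a7) -> (a0,a2,a4,a6,a1,a3,a5,a7).
// They are used below to give the lane-oriented unpack/pack intrinsics
// full-width semantics.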
// wrappers for AVX-512BW byte/word intrinsics, with a fallback that
// processes the two 256-bit halves separately
#ifdef __AVX512BW__
#define SIMD_X_BW_INT_BINFCT_64(INTRIN)                                        \
  static SIMD_INLINE __m512i x_mm512_##INTRIN(__m512i a, __m512i b)            \
  {                                                                            \
    return _mm512_##INTRIN(a, b);                                              \
  }
#else
#define SIMD_X_BW_INT_BINFCT_64(INTRIN)                                        \
  static SIMD_INLINE __m512i x_mm512_##INTRIN(__m512i a, __m512i b)            \
  {                                                                            \
    const __m256i lo =                                                         \
      _mm256_##INTRIN(_mm512_castsi512_si256(a), _mm512_castsi512_si256(b));   \
    const __m256i hi = _mm256_##INTRIN(_mm512_extracti64x4_epi64(a, 1),        \
                                       _mm512_extracti64x4_epi64(b, 1));       \
    return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);              \
  }
#endif

SIMD_X_BW_INT_BINFCT_64(unpacklo_epi8)
SIMD_X_BW_INT_BINFCT_64(unpackhi_epi8)
SIMD_X_BW_INT_BINFCT_64(unpacklo_epi16)
SIMD_X_BW_INT_BINFCT_64(unpackhi_epi16)
SIMD_X_BW_INT_BINFCT_64(shuffle_epi8)
SIMD_X_BW_INT_BINFCT_64(packs_epi16)
SIMD_X_BW_INT_BINFCT_64(packs_epi32)
SIMD_X_BW_INT_BINFCT_64(packus_epi16)
SIMD_X_BW_INT_BINFCT_64(packus_epi32)
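// All x_mm512_* helpers above keep the per-128-bit-lane semantics of the
// underlying intrinsics, e.g. x_mm512_unpacklo_epi8(a, b) interleaves the
// low 8 bytes of each 128-bit lane of a and b. Full-width unpacking is
// assembled below from these plus x_mm512_transpose8x64_epi64.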
static SIMD_INLINE __m512i x_mm512_movm_epi32(__mmask16 k)
{
#ifdef __AVX512DQ__
  return _mm512_movm_epi32(k);
#else
  return _mm512_maskz_mov_epi32(k, _mm512_set1_epi32(-1));
#endif
}

static SIMD_INLINE __m512i x_mm512_movm_epi64(__mmask8 k)
{
#ifdef __AVX512DQ__
  return _mm512_movm_epi64(k);
#else
  return _mm512_maskz_mov_epi64(k, _mm512_set1_epi64(-1));
#endif
}
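// Both helpers expand a bitmask into an element mask: e.g. for epi32 with
// k = 0b0101, elements 0 and 2 of the result become 0xffffffff and all
// other elements 0. The maskz_mov fallback only needs AVX-512F.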
// integer <-> integer
template <typename Tdst, typename Tsrc,
          SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
                          std::is_integral<Tdst>::value &&
                          std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Tsrc, 64> &vec,
                                             OutputType<Tdst>)
{
  return Vec<Tdst, 64>(__m512i(vec));
}

// Float -> integer
template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Float, 64> &vec,
                                             OutputType<Tdst>)
{
  return _mm512_castps_si512(vec);
}

// integer -> Float
template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Float, 64> reinterpret(const Vec<Tsrc, 64> &vec,
                                              OutputType<Float>)
{
  return _mm512_castsi512_ps(vec);
}

// Double -> integer
template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Double, 64> &vec,
                                             OutputType<Tdst>)
{
  return _mm512_castpd_si512(vec);
}

// integer -> Double
template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Double, 64> reinterpret(const Vec<Tsrc, 64> &vec,
                                               OutputType<Double>)
{
  return _mm512_castsi512_pd(vec);
}

// Float -> Double
static SIMD_INLINE Vec<Double, 64> reinterpret(const Vec<Float, 64> &vec,
                                               OutputType<Double>)
{
  return _mm512_castps_pd(vec);
}

// Double -> Float
static SIMD_INLINE Vec<Float, 64> reinterpret(const Vec<Double, 64> &vec,
                                              OutputType<Float>)
{
  return _mm512_castpd_ps(vec);
}

// identity
template <typename T>
static SIMD_INLINE Vec<T, 64> reinterpret(const Vec<T, 64> &vec, OutputType<T>)
{
  return vec;
}
static SIMD_INLINE Vec<Int, 64> cvts(const Vec<Float, 64> &a, OutputType<Int>)
{
  // clip to the largest float that still fits into an Int
  const __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
  return _mm512_cvtps_epi32(_mm512_min_ps(clip, a));
}
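// Only the positive side needs explicit clipping: without it, floats just
// above INT32_MAX would convert to the "integer indefinite" value
// 0x80000000 (= INT32_MIN). Negative overflow already yields INT32_MIN,
// which is the saturated value on that side.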
static SIMD_INLINE Vec<Float, 64> cvts(const Vec<Int, 64> &a, OutputType<Float>)
{
  return _mm512_cvtepi32_ps(a);
}
static SIMD_INLINE Vec<Long, 64> cvts(const Vec<Double, 64> &a,
                                      OutputType<Long>)
{
  const auto clip = _mm512_set1_pd(MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64);
  const auto clipped = _mm512_min_pd(clip, a);
#ifdef __AVX512DQ__
  return _mm512_cvtpd_epi64(clipped);
#else
  // scalar fallback: round each element to the nearest integer
  Double tmpD[8] SIMD_ATTR_ALIGNED(64);
  _mm512_store_pd(tmpD, clipped);
  Long tmpL[8] SIMD_ATTR_ALIGNED(64);
  for (size_t i = 0; i < 8; ++i) {
    tmpL[i] = static_cast<Long>(std::rint(tmpD[i]));
  }
  return _mm512_load_si512((__m512i *) tmpL);
#endif
}
static SIMD_INLINE Vec<Long, 64> cvts(const Vec<Long, 64> &a,
                                      OutputType<Double>) = delete; // unused

static SIMD_INLINE Vec<Double, 64> cvts(const Vec<Long, 64> &a,
                                        OutputType<Double>)
{
#ifdef __AVX512DQ__
  return _mm512_cvtepi64_pd(a);
#else
#if 1
  // branch-free version: split each 64-bit integer into a signed high
  // 16-bit part and a low 48-bit part and turn both into doubles via
  // magic-number bit tricks (see the note below)
  __m512i xH = _mm512_srai_epi32(a, 16);
  xH = _mm512_and_si512(xH, _mm512_set1_epi32(0xffff0000));
  xH = _mm512_add_epi64(
    xH, _mm512_castpd_si512(_mm512_set1_pd(442721857769029238784.)));
  __m512i xL = _mm512_or_si512(
    _mm512_and_si512(a, _mm512_set1_epi64(0x0000ffffffffffff)),
    _mm512_castpd_si512(_mm512_set1_pd(0x0010000000000000)));
  const __m512d f =
    _mm512_sub_pd(_mm512_castsi512_pd(xH),
                  _mm512_set1_pd(442726361368656609280.));
  return _mm512_add_pd(f, _mm512_castsi512_pd(xL));
#else
  // scalar fallback, kept for reference
  Long tmpL[8] SIMD_ATTR_ALIGNED(64);
  _mm512_store_si512((__m512i *) tmpL, a);
  Double tmpD[8] SIMD_ATTR_ALIGNED(64);
  for (size_t i = 0; i < 8; ++i) { tmpD[i] = static_cast<Double>(tmpL[i]); }
  return _mm512_load_pd(tmpD);
#endif
#endif
}
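// Note on the magic constants above (a standard int64 -> double trick):
// 442721857769029238784.0 = 3 * 2^67 and 442726361368656609280.0
// = 3 * 2^67 + 2^52. OR-ing the low 48 bits of each integer into the
// mantissa of the double 2^52 makes xL encode 2^52 + low48 exactly, and
// the signed high 16 bits are folded analogously into the double
// 3 * 2^67, so (xH - (3 * 2^67 + 2^52)) + xL reconstructs the original
// value, exactly up to the usual double rounding above 2^53.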
template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> setzero(OutputType<T>, Integer<64>)
{
  return _mm512_setzero_si512();
}

static SIMD_INLINE Vec<Float, 64> setzero(OutputType<Float>, Integer<64>)
{
  return _mm512_setzero_ps();
}

static SIMD_INLINE Vec<Double, 64> setzero(OutputType<Double>, Integer<64>)
{
  return _mm512_setzero_pd();
}
static SIMD_INLINE Vec<Byte, 64> set1(Byte a, Integer<64>)
{
  return _mm512_set1_epi8(a);
}

static SIMD_INLINE Vec<SignedByte, 64> set1(SignedByte a, Integer<64>)
{
  return _mm512_set1_epi8(a);
}

static SIMD_INLINE Vec<Word, 64> set1(Word a, Integer<64>)
{
  return _mm512_set1_epi16(a);
}

static SIMD_INLINE Vec<Short, 64> set1(Short a, Integer<64>)
{
  return _mm512_set1_epi16(a);
}

static SIMD_INLINE Vec<Int, 64> set1(Int a, Integer<64>)
{
  return _mm512_set1_epi32(a);
}

static SIMD_INLINE Vec<Long, 64> set1(Long a, Integer<64>)
{
  return _mm512_set1_epi64(a);
}

static SIMD_INLINE Vec<Float, 64> set1(Float a, Integer<64>)
{
  return _mm512_set1_ps(a);
}

static SIMD_INLINE Vec<Double, 64> set1(Double a, Integer<64>)
{
  return _mm512_set1_pd(a);
}
template <typename T>
static SIMD_INLINE Vec<T, 64> load(const T *const p, Integer<64>)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  return _mm512_load_si512((__m512i *) p);
}

static SIMD_INLINE Vec<Float, 64> load(const Float *const p, Integer<64>)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  return _mm512_load_ps(p);
}

static SIMD_INLINE Vec<Double, 64> load(const Double *const p, Integer<64>)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  return _mm512_load_pd(p);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> loadu(const T *const p, Integer<64>)
{
  return _mm512_loadu_si512((__m512i *) p);
}

static SIMD_INLINE Vec<Float, 64> loadu(const Float *const p, Integer<64>)
{
  return _mm512_loadu_ps(p);
}

static SIMD_INLINE Vec<Double, 64> loadu(const Double *const p, Integer<64>)
{
  return _mm512_loadu_pd(p);
}
template <typename T>
static SIMD_INLINE void store(T *const p, const Vec<T, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_store_si512((__m512i *) p, a);
}

static SIMD_INLINE void store(Float *const p, const Vec<Float, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_store_ps(p, a);
}

static SIMD_INLINE void store(Double *const p, const Vec<Double, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_store_pd(p, a);
}

template <typename T>
static SIMD_INLINE void storeu(T *const p, const Vec<T, 64> &a)
{
  _mm512_storeu_si512((__m512i *) p, a);
}

static SIMD_INLINE void storeu(Float *const p, const Vec<Float, 64> &a)
{
  _mm512_storeu_ps(p, a);
}

static SIMD_INLINE void storeu(Double *const p, const Vec<Double, 64> &a)
{
  _mm512_storeu_pd(p, a);
}

template <typename T>
static SIMD_INLINE void stream_store(T *const p, const Vec<T, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_stream_si512((__m512i *) p, a);
}

static SIMD_INLINE void stream_store(Float *const p, const Vec<Float, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_stream_ps(p, a);
}

static SIMD_INLINE void stream_store(Double *const p, const Vec<Double, 64> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 64);
  _mm512_stream_pd(p, a);
}
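// stream_store uses non-temporal hints: the data is written around the
// cache hierarchy, which pays off for large buffers that are not read
// again soon. Non-temporal stores are weakly ordered, so producers
// typically issue a store fence (e.g. _mm_sfence()) before other threads
// read the buffer.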
// extract element COUNT; an out-of-range COUNT yields 0
template <size_t COUNT>
static SIMD_INLINE Byte extract(const Vec<Byte, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a, COUNT >> 4),
                            COUNT % 16);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE SignedByte extract(const Vec<SignedByte, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a, COUNT >> 4),
                            COUNT % 16);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Word extract(const Vec<Word, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a, COUNT >> 3),
                             COUNT % 8);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Short extract(const Vec<Short, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a, COUNT >> 3),
                             COUNT % 8);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Int extract(const Vec<Int, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, COUNT >> 2),
                             COUNT % 4);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Long extract(const Vec<Long, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    return _mm_extract_epi64(_mm512_extracti32x4_epi32(a, COUNT >> 1),
                             COUNT % 2);
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Float extract(const Vec<Float, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return ::simd::internal::bit_cast<Float>(
      _mm_extract_ps(_mm512_extractf32x4_ps(a, COUNT >> 2), COUNT % 4));
  } else {
    return 0;
  }
}

template <size_t COUNT>
static SIMD_INLINE Double extract(const Vec<Double, 64> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    return ::simd::internal::bit_cast<Double>(_mm_extract_epi64(
      _mm512_extracti32x4_epi32(_mm512_castpd_si512(a), COUNT >> 1),
      COUNT % 2));
  } else {
    return 0;
  }
}
template <size_t LANE_INDEX, typename T>
static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 64> &a)
{
  const auto intA = reinterpret(a, OutputType<Int>());
  const Vec<Int, 16> intRes = _mm512_extracti32x4_epi32(intA, LANE_INDEX);
  return reinterpret(intRes, OutputType<T>());
}
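// E.g. extractLane<2>(v) returns the third 128-bit lane of v as a
// Vec<T, 16>; the detour through Int vectors lets one implementation
// based on _mm512_extracti32x4_epi32 serve all element types.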
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> add(const Vec<Byte, 64> &a,
                                     const Vec<Byte, 64> &b)
{
  return _mm512_add_epi8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> add(const Vec<SignedByte, 64> &a,
                                           const Vec<SignedByte, 64> &b)
{
  return _mm512_add_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> add(const Vec<Word, 64> &a,
                                     const Vec<Word, 64> &b)
{
  return _mm512_add_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 64> add(const Vec<Short, 64> &a,
                                      const Vec<Short, 64> &b)
{
  return _mm512_add_epi16(a, b);
}

#else

// without AVX-512BW there are no 512-bit 8/16-bit adds: process the two
// 256-bit halves separately
template <typename T>
static SIMD_INLINE Vec<T, 64> add(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(add(a.lo(), b.lo()), add(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> add(const Vec<Int, 64> &a,
                                    const Vec<Int, 64> &b)
{
  return _mm512_add_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 64> add(const Vec<Long, 64> &a,
                                     const Vec<Long, 64> &b)
{
  return _mm512_add_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 64> add(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_add_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> add(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_add_pd(a, b);
}
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> adds(const Vec<Byte, 64> &a,
                                      const Vec<Byte, 64> &b)
{
  return _mm512_adds_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> adds(const Vec<SignedByte, 64> &a,
                                            const Vec<SignedByte, 64> &b)
{
  return _mm512_adds_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> adds(const Vec<Word, 64> &a,
                                      const Vec<Word, 64> &b)
{
  return _mm512_adds_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 64> adds(const Vec<Short, 64> &a,
                                       const Vec<Short, 64> &b)
{
  return _mm512_adds_epi16(a, b);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> adds(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(adds(a.lo(), b.lo()), adds(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> adds(const Vec<Int, 64> &a,
                                     const Vec<Int, 64> &b)
{
  const __m512i sum = _mm512_add_epi32(a, b);
  const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
  const __m512i sumHasDiffSign = _mm512_xor_si512(a, sum);
  // overflow iff a and b have the same sign but the sum's sign differs
  const __m512i overflow = _mm512_srai_epi32(
    _mm512_andnot_si512(opsHaveDiffSign, sumHasDiffSign), 31);
  // on overflow, saturate in the direction of a's sign
  const __m512i saturatedSum =
    _mm512_xor_si512(_mm512_srai_epi32(a, 31), _mm512_set1_epi32(0x7FFFFFFF));
  return _mm512_or_si512(_mm512_andnot_si512(overflow, sum),
                         _mm512_and_si512(overflow, saturatedSum));
}
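// Worked example for the saturation logic above: a = INT32_MAX, b = 1
// gives sum = INT32_MIN; a and b agree in sign while a and sum differ, so
// overflow = 0xffffffff, and srai(a, 31) ^ 0x7FFFFFFF = 0 ^ 0x7FFFFFFF
// selects INT32_MAX as the saturated result. For negative a the same
// expression yields 0x80000000 = INT32_MIN.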
static SIMD_INLINE Vec<Long, 64> adds(const Vec<Long, 64> &a,
                                      const Vec<Long, 64> &b)
{
  const __m512i sum = _mm512_add_epi64(a, b);
  const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
  const __m512i sumHasDiffSign = _mm512_xor_si512(a, sum);
  // overflow iff a and b have the same sign but the sum's sign differs
  const __m512i overflow = _mm512_srai_epi64(
    _mm512_andnot_si512(opsHaveDiffSign, sumHasDiffSign), 63);
  // on overflow, saturate in the direction of a's sign
  const __m512i saturatedSum = _mm512_xor_si512(
    _mm512_srai_epi64(a, 63), _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
  return _mm512_or_si512(_mm512_andnot_si512(overflow, sum),
                         _mm512_and_si512(overflow, saturatedSum));
}

// floats saturate naturally (overflow goes to +/-inf)
static SIMD_INLINE Vec<Float, 64> adds(const Vec<Float, 64> &a,
                                       const Vec<Float, 64> &b)
{
  return _mm512_add_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> adds(const Vec<Double, 64> &a,
                                        const Vec<Double, 64> &b)
{
  return _mm512_add_pd(a, b);
}
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> sub(const Vec<Byte, 64> &a,
                                     const Vec<Byte, 64> &b)
{
  return _mm512_sub_epi8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> sub(const Vec<SignedByte, 64> &a,
                                           const Vec<SignedByte, 64> &b)
{
  return _mm512_sub_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> sub(const Vec<Word, 64> &a,
                                     const Vec<Word, 64> &b)
{
  return _mm512_sub_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 64> sub(const Vec<Short, 64> &a,
                                      const Vec<Short, 64> &b)
{
  return _mm512_sub_epi16(a, b);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> sub(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(sub(a.lo(), b.lo()), sub(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> sub(const Vec<Int, 64> &a,
                                    const Vec<Int, 64> &b)
{
  return _mm512_sub_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 64> sub(const Vec<Long, 64> &a,
                                     const Vec<Long, 64> &b)
{
  return _mm512_sub_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 64> sub(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_sub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> sub(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_sub_pd(a, b);
}
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> subs(const Vec<Byte, 64> &a,
                                      const Vec<Byte, 64> &b)
{
  return _mm512_subs_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> subs(const Vec<SignedByte, 64> &a,
                                            const Vec<SignedByte, 64> &b)
{
  return _mm512_subs_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> subs(const Vec<Word, 64> &a,
                                      const Vec<Word, 64> &b)
{
  return _mm512_subs_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 64> subs(const Vec<Short, 64> &a,
                                       const Vec<Short, 64> &b)
{
  return _mm512_subs_epi16(a, b);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> subs(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(subs(a.lo(), b.lo()), subs(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> subs(const Vec<Int, 64> &a,
                                     const Vec<Int, 64> &b)
{
  const __m512i diff = _mm512_sub_epi32(a, b);
  const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
  const __m512i diffHasDiffSign = _mm512_xor_si512(a, diff);
  // overflow iff a and b have different signs and the difference's sign
  // differs from a's
  const __m512i overflow = _mm512_srai_epi32(
    _mm512_and_si512(opsHaveDiffSign, diffHasDiffSign), 31);
  // on overflow, saturate in the direction of a's sign
  const __m512i saturatedDiff =
    _mm512_xor_si512(_mm512_srai_epi32(a, 31), _mm512_set1_epi32(0x7FFFFFFF));
  return _mm512_or_si512(_mm512_andnot_si512(overflow, diff),
                         _mm512_and_si512(overflow, saturatedDiff));
}

static SIMD_INLINE Vec<Long, 64> subs(const Vec<Long, 64> &a,
                                      const Vec<Long, 64> &b)
{
  const __m512i diff = _mm512_sub_epi64(a, b);
  const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
  const __m512i diffHasDiffSign = _mm512_xor_si512(a, diff);
  const __m512i overflow = _mm512_srai_epi64(
    _mm512_and_si512(opsHaveDiffSign, diffHasDiffSign), 63);
  const __m512i saturatedDiff = _mm512_xor_si512(
    _mm512_srai_epi64(a, 63), _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
  return _mm512_or_si512(_mm512_andnot_si512(overflow, diff),
                         _mm512_and_si512(overflow, saturatedDiff));
}
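// Note the asymmetry with adds: for subtraction, overflow is only
// possible when the operands have *different* signs, hence the and
// instead of the andnot in the overflow test; the result again saturates
// in the direction of a's sign.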
static SIMD_INLINE Vec<Float, 64> subs(const Vec<Float, 64> &a,
                                       const Vec<Float, 64> &b)
{
  return _mm512_sub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> subs(const Vec<Double, 64> &a,
                                        const Vec<Double, 64> &b)
{
  return _mm512_sub_pd(a, b);
}
#ifdef __AVX512BW__

static SIMD_INLINE Vec<SignedByte, 64> neg(const Vec<SignedByte, 64> &a)
{
  return _mm512_sub_epi8(_mm512_setzero_si512(), a);
}

static SIMD_INLINE Vec<Short, 64> neg(const Vec<Short, 64> &a)
{
  return _mm512_sub_epi16(_mm512_setzero_si512(), a);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> neg(const Vec<T, 64> &a)
{
  return Vec<T, 64>(neg(a.lo()), neg(a.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> neg(const Vec<Int, 64> &a)
{
  return _mm512_sub_epi32(_mm512_setzero_si512(), a);
}

static SIMD_INLINE Vec<Long, 64> neg(const Vec<Long, 64> &a)
{
  return _mm512_sub_epi64(_mm512_setzero_si512(), a);
}

static SIMD_INLINE Vec<Float, 64> neg(const Vec<Float, 64> &a)
{
  // flip the sign bit
  return _mm512_castsi512_ps(
    _mm512_xor_si512(_mm512_set1_epi32(0x80000000), _mm512_castps_si512(a)));
}

static SIMD_INLINE Vec<Double, 64> neg(const Vec<Double, 64> &a)
{
  // flip the sign bit
  return _mm512_castsi512_pd(_mm512_xor_si512(
    _mm512_set1_epi64(0x8000000000000000), _mm512_castpd_si512(a)));
}
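// Negating floats by XOR-ing the sign bit (instead of subtracting from
// zero) is a single bitwise op and behaves uniformly for all inputs,
// including zeros and NaNs: neg(+0.0) == -0.0, whereas 0.0 - (+0.0)
// would yield +0.0.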
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> min(const Vec<Byte, 64> &a,
                                     const Vec<Byte, 64> &b)
{
  return _mm512_min_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> min(const Vec<SignedByte, 64> &a,
                                           const Vec<SignedByte, 64> &b)
{
  return _mm512_min_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> min(const Vec<Word, 64> &a,
                                     const Vec<Word, 64> &b)
{
  return _mm512_min_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 64> min(const Vec<Short, 64> &a,
                                      const Vec<Short, 64> &b)
{
  return _mm512_min_epi16(a, b);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> min(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(min(a.lo(), b.lo()), min(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> min(const Vec<Int, 64> &a,
                                    const Vec<Int, 64> &b)
{
  return _mm512_min_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 64> min(const Vec<Long, 64> &a,
                                     const Vec<Long, 64> &b)
{
  return _mm512_min_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 64> min(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_min_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> min(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_min_pd(a, b);
}
#ifdef __AVX512BW__

static SIMD_INLINE Vec<Byte, 64> max(const Vec<Byte, 64> &a,
                                     const Vec<Byte, 64> &b)
{
  return _mm512_max_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 64> max(const Vec<SignedByte, 64> &a,
                                           const Vec<SignedByte, 64> &b)
{
  return _mm512_max_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 64> max(const Vec<Word, 64> &a,
                                     const Vec<Word, 64> &b)
{
  return _mm512_max_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 64> max(const Vec<Short, 64> &a,
                                      const Vec<Short, 64> &b)
{
  return _mm512_max_epi16(a, b);
}

#else

template <typename T>
static SIMD_INLINE Vec<T, 64> max(const Vec<T, 64> &a, const Vec<T, 64> &b)
{
  return Vec<T, 64>(max(a.lo(), b.lo()), max(a.hi(), b.hi()));
}

#endif

static SIMD_INLINE Vec<Int, 64> max(const Vec<Int, 64> &a,
                                    const Vec<Int, 64> &b)
{
  return _mm512_max_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 64> max(const Vec<Long, 64> &a,
                                     const Vec<Long, 64> &b)
{
  return _mm512_max_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 64> max(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_max_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> max(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_max_pd(a, b);
}
static SIMD_INLINE Vec<Float, 64> mul(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_mul_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> mul(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_mul_pd(a, b);
}

static SIMD_INLINE Vec<Float, 64> div(const Vec<Float, 64> &a,
                                      const Vec<Float, 64> &b)
{
  return _mm512_div_ps(a, b);
}

static SIMD_INLINE Vec<Double, 64> div(const Vec<Double, 64> &a,
                                       const Vec<Double, 64> &b)
{
  return _mm512_div_pd(a, b);
}
// rounding is a no-op for integral element types
template <typename T>
static SIMD_INLINE Vec<T, 64> ceil(const Vec<T, 64> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 64> floor(const Vec<T, 64> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 64> round(const Vec<T, 64> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 64> truncate(const Vec<T, 64> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}
static SIMD_INLINE Vec<Float, 64> ceil(const Vec<Float, 64> &a)
{
  return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> ceil(const Vec<Double, 64> &a)
{
  return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> floor(const Vec<Float, 64> &a)
{
  return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> floor(const Vec<Double, 64> &a)
{
  return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> round(const Vec<Float, 64> &a)
{
  return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> round(const Vec<Double, 64> &a)
{
  return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Float, 64> truncate(const Vec<Float, 64> &a)
{
  return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

static SIMD_INLINE Vec<Double, 64> truncate(const Vec<Double, 64> &a)
{
  return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
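// Caveat: _MM_FROUND_TO_NEAREST_INT rounds halfway cases to even, so
// round(2.5f) == 2.0f and round(3.5f) == 4.0f; this differs from
// std::round, which rounds halfway cases away from zero.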
static SIMD_INLINE Vec<Float, 64> rcp(const Vec<Float, 64> &a)
{
#ifdef __AVX512ER__
  return _mm512_rcp28_ps(a);
#else
  return _mm512_rcp14_ps(a);
#endif
}

static SIMD_INLINE Vec<Double, 64> rcp(const Vec<Double, 64> &a)
{
#ifdef __AVX512ER__
  return _mm512_rcp28_pd(a);
#else
  return _mm512_rcp14_pd(a);
#endif
}

static SIMD_INLINE Vec<Float, 64> rsqrt(const Vec<Float, 64> &a)
{
#ifdef __AVX512ER__
  return _mm512_rsqrt28_ps(a);
#else
  return _mm512_rsqrt14_ps(a);
#endif
}

static SIMD_INLINE Vec<Double, 64> rsqrt(const Vec<Double, 64> &a)
{
#ifdef __AVX512ER__
  return _mm512_rsqrt28_pd(a);
#else
  return _mm512_rsqrt14_pd(a);
#endif
}
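// rcp/rsqrt are approximations: the *14 variants guarantee a relative
// error of at most 2^-14, the *28 variants (AVX-512ER, Xeon Phi) at most
// 2^-28. Callers that need full precision should use div/sqrt instead or
// add a Newton-Raphson refinement step.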
static SIMD_INLINE Vec<Float, 64> sqrt(const Vec<Float, 64> &a)
{
  return _mm512_sqrt_ps(a);
}

static SIMD_INLINE Vec<Double, 64> sqrt(const Vec<Double, 64> &a)
{
  return _mm512_sqrt_pd(a);
}
// unsigned element types: identity
template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value &&
                                     std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 64> abs(const Vec<T, 64> &a)
{
  return a;
}

static SIMD_INLINE Vec<SignedByte, 64> abs(const Vec<SignedByte, 64> &a)
{
#ifdef __AVX512BW__
  return _mm512_abs_epi8(a);
#else
  return Vec<SignedByte, 64>(abs(a.lo()), abs(a.hi()));
#endif
}

static SIMD_INLINE Vec<Short, 64> abs(const Vec<Short, 64> &a)
{
#ifdef __AVX512BW__
  return _mm512_abs_epi16(a);
#else
  return Vec<Short, 64>(abs(a.lo()), abs(a.hi()));
#endif
}

static SIMD_INLINE Vec<Int, 64> abs(const Vec<Int, 64> &a)
{
  return _mm512_abs_epi32(a);
}

static SIMD_INLINE Vec<Long, 64> abs(const Vec<Long, 64> &a)
{
  return _mm512_abs_epi64(a);
}

static SIMD_INLINE Vec<Float, 64> abs(const Vec<Float, 64> &a)
{
  return _mm512_abs_ps(a);
}

static SIMD_INLINE Vec<Double, 64> abs(const Vec<Double, 64> &a)
{
  return _mm512_abs_pd(a);
}
template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<1>)
{
#ifdef __AVX512VBMI__
  __m512i idx = _mm512_set_epi8(
    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
    22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
    76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66,
    2, 65, 1, 64, 0);
  return _mm512_permutex2var_epi8(a, idx, b);
#else
  return x_mm512_unpacklo_epi8(x_mm512_transpose8x64_epi64(a),
                               x_mm512_transpose8x64_epi64(b));
#endif
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<2>)
{
#ifdef __AVX512BW__
  __m512i idx =
    _mm512_set_epi16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40,
                     8, 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
  return _mm512_permutex2var_epi16(a, idx, b);
#else
  return x_mm512_unpacklo_epi16(x_mm512_transpose8x64_epi64(a),
                                x_mm512_transpose8x64_epi64(b));
#endif
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<4>)
{
  __m512i idx =
    _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
  return _mm512_permutex2var_epi32(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
  return _mm512_permutex2var_epi64(a, idx, b);
}
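// permutex2var treats a and b as one table: index values 0..N-1 select
// from a, N..2N-1 from b. E.g. the idx above, (11, 3, 10, 2, 9, 1, 8, 0),
// read from low to high as 0,8,1,9,2,10,3,11, produces
// (a0, b0, a1, b1, a2, b2, a3, b3).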
template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
  return _mm512_permutex2var_epi64(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<0>, Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
  return _mm512_permutex2var_epi64(a, idx, b);
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<0>,
                                         Bytes<4>)
{
  __m512i idx =
    _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
  return _mm512_permutex2var_ps(a, idx, b);
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<0>,
                                         Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<0>,
                                         Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<0>,
                                         Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<0>,
                                          Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<0>,
                                          Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<0>,
                                          Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
  return _mm512_permutex2var_pd(a, idx, b);
}
template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<1>)
{
#ifdef __AVX512VBMI__
  __m512i idx = _mm512_set_epi8(
    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
  return _mm512_permutex2var_epi8(a, idx, b);
#else
  return x_mm512_unpackhi_epi8(x_mm512_transpose8x64_epi64(a),
                               x_mm512_transpose8x64_epi64(b));
#endif
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<2>)
{
#ifdef __AVX512BW__
  __m512i idx = _mm512_set_epi16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26,
                                 57, 25, 56, 24, 55, 23, 54, 22, 53, 21, 52, 20,
                                 51, 19, 50, 18, 49, 17, 48, 16);
  return _mm512_permutex2var_epi16(a, idx, b);
#else
  return x_mm512_unpackhi_epi16(x_mm512_transpose8x64_epi64(a),
                                x_mm512_transpose8x64_epi64(b));
#endif
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<4>)
{
  __m512i idx = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10,
                                 25, 9, 24, 8);
  return _mm512_permutex2var_epi32(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
  return _mm512_permutex2var_epi64(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
  return _mm512_permutex2var_epi64(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                     Part<1>, Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
  return _mm512_permutex2var_epi64(a, idx, b);
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<1>,
                                         Bytes<4>)
{
  __m512i idx = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10,
                                 25, 9, 24, 8);
  return _mm512_permutex2var_ps(a, idx, b);
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<1>,
                                         Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<1>,
                                         Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
                                         const Vec<Float, 64> &b, Part<1>,
                                         Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<1>,
                                          Bytes<8>)
{
  __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<1>,
                                          Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
                                          const Vec<Double, 64> &b, Part<1>,
                                          Bytes<32>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
  return _mm512_permutex2var_pd(a, idx, b);
}
template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<1>)
{
  return x_mm512_unpacklo_epi8(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<2>)
{
  return x_mm512_unpacklo_epi16(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<4>)
{
  return _mm512_unpacklo_epi32(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<8>)
{
  return _mm512_unpacklo_epi64(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
  return _mm512_permutex2var_epi64(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<0>, Bytes<32>)
{
  return _mm512_shuffle_i32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<0>,
                                           Bytes<4>)
{
  return _mm512_unpacklo_ps(a, b);
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<0>,
                                           Bytes<8>)
{
  return _mm512_castpd_ps(
    _mm512_unpacklo_pd(_mm512_castps_pd(a), _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<0>,
                                           Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<0>,
                                           Bytes<32>)
{
  return _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<0>,
                                            Bytes<8>)
{
  return _mm512_unpacklo_pd(a, b);
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<0>,
                                            Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<0>,
                                            Bytes<32>)
{
  return _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0));
}
template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<1>)
{
  return x_mm512_unpackhi_epi8(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<2>)
{
  return x_mm512_unpackhi_epi16(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<4>)
{
  return _mm512_unpackhi_epi32(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<8>)
{
  return _mm512_unpackhi_epi64(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
  return _mm512_permutex2var_epi64(a, idx, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
                                       Part<1>, Bytes<32>)
{
  return _mm512_shuffle_i32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<1>,
                                           Bytes<4>)
{
  return _mm512_unpackhi_ps(a, b);
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<1>,
                                           Bytes<8>)
{
  return _mm512_castpd_ps(
    _mm512_unpackhi_pd(_mm512_castps_pd(a), _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<1>,
                                           Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
  return _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
}

static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
                                           const Vec<Float, 64> &b, Part<1>,
                                           Bytes<32>)
{
  return _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<1>,
                                            Bytes<8>)
{
  return _mm512_unpackhi_pd(a, b);
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<1>,
                                            Bytes<16>)
{
  __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
  return _mm512_permutex2var_pd(a, idx, b);
}

static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
                                            const Vec<Double, 64> &b, Part<1>,
                                            Bytes<32>)
{
  return _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2));
}
template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip(const Vec<T, 64> a, const Vec<T, 64> b,
                            Vec<T, 64> &l, Vec<T, 64> &h)
{
  l = unpack(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
  h = unpack(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
}

template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip16(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h)
{
  l = unpack16(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
  h = unpack16(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
}
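// zip interleaves a and b in groups of NUM_ELEMS elements: e.g. for
// NUM_ELEMS == 1, l = (a0, b0, a1, b1, ...) and h continues with the
// upper halves of a and b; zip16 is the lane-oriented variant built on
// unpack16. unzip below is the inverse of zip.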
template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<1>)
{
#ifdef __AVX512VBMI__
  const __m512i idxL = _mm512_set_epi8(
    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
    96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
    58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
    20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH = _mm512_set_epi8(
    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
    97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
    59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
    21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_epi8(a, idxL, b);
  h = _mm512_permutex2var_epi8(a, idxH, b);
#else
  // de-interleave within each 128-bit lane, then gather the lane halves
  const __m512i mask = _mm512_set_epi8(
    15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5,
    3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
    6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i atmp = x_mm512_shuffle_epi8(a, mask);
  const __m512i btmp = x_mm512_shuffle_epi8(b, mask);
  l = _mm512_permutex2var_epi64(
    atmp, _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0), btmp);
  h = _mm512_permutex2var_epi64(
    atmp, _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1), btmp);
#endif
}
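// unzip de-interleaves: viewing the concatenation (a, b) as a sequence of
// element pairs, l receives the first and h the second element of each
// pair. A typical use is separating the channels of interleaved
// two-channel data.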
template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<2>)
{
#ifdef __AVX512BW__
  const __m512i idxL = _mm512_set_epi16(
    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
    24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH = _mm512_set_epi16(
    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
    25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_epi16(a, idxL, b);
  h = _mm512_permutex2var_epi16(a, idxH, b);
#else
  const __m512i mask = _mm512_set_epi8(
    15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6,
    3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5,
    4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
  const __m512i atmp = x_mm512_shuffle_epi8(a, mask);
  const __m512i btmp = x_mm512_shuffle_epi8(b, mask);
  l = _mm512_permutex2var_epi64(
    atmp, _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0), btmp);
  h = _mm512_permutex2var_epi64(
    atmp, _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1), btmp);
#endif
}
template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<4>)
{
  const __m512i idxL =
    _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH =
    _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_epi32(a, idxL, b);
  h = _mm512_permutex2var_epi32(a, idxH, b);
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<8>)
{
  const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_epi64(a, idxL, b);
  h = _mm512_permutex2var_epi64(a, idxH, b);
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<16>)
{
  const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
  const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
  l = _mm512_permutex2var_epi64(a, idxL, b);
  h = _mm512_permutex2var_epi64(a, idxH, b);
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
                              Vec<T, 64> &l, Vec<T, 64> &h, Bytes<32>)
{
  l = unpack(a, b, Part<0>(), Bytes<32>());
  h = unpack(a, b, Part<1>(), Bytes<32>());
}
static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
                              Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<4>)
{
  const __m512i idxL =
    _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH =
    _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_ps(a, idxL, b);
  h = _mm512_permutex2var_ps(a, idxH, b);
}

static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
                              Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<8>)
{
  const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idxL, _mm512_castps_pd(b)));
  h = _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idxH, _mm512_castps_pd(b)));
}

static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
                              Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<16>)
{
  const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
  const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
  l = _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idxL, _mm512_castps_pd(b)));
  h = _mm512_castpd_ps(
    _mm512_permutex2var_pd(_mm512_castps_pd(a), idxH, _mm512_castps_pd(b)));
}

static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
                              Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<32>)
{
  l = unpack(a, b, Part<0>(), Bytes<32>());
  h = unpack(a, b, Part<1>(), Bytes<32>());
}

static SIMD_INLINE void unzip(const Vec<Double, 64> a, const Vec<Double, 64> b,
                              Vec<Double, 64> &l, Vec<Double, 64> &h, Bytes<8>)
{
  const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
  const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
  l = _mm512_permutex2var_pd(a, idxL, b);
  h = _mm512_permutex2var_pd(a, idxH, b);
}

static SIMD_INLINE void unzip(const Vec<Double, 64> a, const Vec<Double, 64> b,
                              Vec<Double, 64> &l, Vec<Double, 64> &h, Bytes<16>)
{
  const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
  const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
  l = _mm512_permutex2var_pd(a, idxL, b);
  h = _mm512_permutex2var_pd(a, idxH, b);
}
static SIMD_INLINE Vec<SignedByte, 64> packs(const Vec<Short, 64> &a,
                                             const Vec<Short, 64> &b,
                                             OutputType<SignedByte>)
{
  return x_mm512_evenodd8x64_epi64(x_mm512_packs_epi16(a, b));
}

static SIMD_INLINE Vec<Short, 64> packs(const Vec<Int, 64> &a,
                                        const Vec<Int, 64> &b,
                                        OutputType<Short>)
{
  return x_mm512_evenodd8x64_epi64(x_mm512_packs_epi32(a, b));
}

static SIMD_INLINE Vec<Short, 64> packs(const Vec<Float, 64> &a,
                                        const Vec<Float, 64> &b,
                                        OutputType<Short>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Short>());
}

static SIMD_INLINE Vec<Int, 64> packs(const Vec<Long, 64> &a,
                                      const Vec<Long, 64> &b, OutputType<Int>)
{
  return _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi64_epi32(a)),
                            _mm512_cvtsepi64_epi32(b), 1);
}

static SIMD_INLINE Vec<Float, 64> packs(const Vec<Long, 64> &a,
                                        const Vec<Long, 64> &b,
                                        OutputType<Float>)
{
#ifdef __AVX512DQ__
  const __m256d low = _mm256_castps_pd(_mm512_cvtepi64_ps(a));
  const __m256d high = _mm256_castps_pd(_mm512_cvtepi64_ps(b));
#else
  const __m256d low =
    _mm256_castps_pd(_mm512_cvtpd_ps(cvts(a, OutputType<Double>())));
  const __m256d high =
    _mm256_castps_pd(_mm512_cvtpd_ps(cvts(b, OutputType<Double>())));
#endif
  return _mm512_castpd_ps(
    _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1));
}

static SIMD_INLINE Vec<Float, 64> packs(const Vec<Double, 64> &a,
                                        const Vec<Double, 64> &b,
                                        OutputType<Float>)
{
  const __m256d low = _mm256_castps_pd(_mm512_cvtpd_ps(a));
  const __m256d high = _mm256_castps_pd(_mm512_cvtpd_ps(b));
  return _mm512_castpd_ps(
    _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1));
}

static SIMD_INLINE Vec<Int, 64> packs(const Vec<Double, 64> &a,
                                      const Vec<Double, 64> &b, OutputType<Int>)
{
  // clip only the positive side; negative overflow already saturates to
  // INT32_MIN via the "integer indefinite" result of cvtpd_epi32
  const __m512d clip = _mm512_set1_pd(std::numeric_limits<Int>::max());
  const __m256i low = _mm512_cvtpd_epi32(_mm512_min_pd(clip, a));
  const __m256i high = _mm512_cvtpd_epi32(_mm512_min_pd(clip, b));
  return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
}

static SIMD_INLINE Vec<Byte, 64> packs(const Vec<Word, 64> &a,
                                       const Vec<Word, 64> &b, OutputType<Byte>)
{
  // packus_epi16 treats its input as signed, so Word values above 0x7fff
  // would be clamped to 0; pre-clamping to 0xff makes them saturate
  const auto aSaturated = min(a, Vec<Word, 64>(_mm512_set1_epi16(0xff)));
  const auto bSaturated = min(b, Vec<Word, 64>(_mm512_set1_epi16(0xff)));
  return x_mm512_evenodd8x64_epi64(
    x_mm512_packus_epi16(aSaturated, bSaturated));
}
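// Why the evenodd shuffle: x_mm512_packs/packus pack within 128-bit
// lanes, so the packed result comes out in lane-interleaved order;
// x_mm512_evenodd8x64_epi64 restores the natural element order across
// the full 512-bit vector.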
static SIMD_INLINE Vec<Byte, 64> packs(const Vec<Short, 64> &a,
                                       const Vec<Short, 64> &b,
                                       OutputType<Byte>)
{
  return x_mm512_evenodd8x64_epi64(x_mm512_packus_epi16(a, b));
}

static SIMD_INLINE Vec<Word, 64> packs(const Vec<Int, 64> &a,
                                       const Vec<Int, 64> &b, OutputType<Word>)
{
  return x_mm512_evenodd8x64_epi64(x_mm512_packus_epi32(a, b));
}

static SIMD_INLINE Vec<Word, 64> packs(const Vec<Float, 64> &a,
                                       const Vec<Float, 64> &b,
                                       OutputType<Word>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Word>());
}

static SIMD_INLINE Vec<SignedByte, 64> packs(const Vec<Word, 64> &a,
                                             const Vec<Word, 64> &b,
                                             OutputType<SignedByte>)
{
  // clamp to 0x7f so that large Word values saturate instead of being
  // misinterpreted as negative by the signed pack
  return x_mm512_evenodd8x64_epi64(
    x_mm512_packs_epi16(min(a, Vec<Word, 64>(_mm512_set1_epi16(0x7f))),
                        min(b, Vec<Word, 64>(_mm512_set1_epi16(0x7f)))));
}
// same type: identity
template <typename T>
static SIMD_INLINE void extend(const Vec<T, 64> &vIn, Vec<T, 64> vOut[1])
{
  vOut[0] = vIn;
}

static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Byte, 64> vOut[1])
{
  vOut[0] = max(vIn, Vec<SignedByte, 64>(_mm512_setzero_si512()));
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn,
                               Vec<SignedByte, 64> vOut[1])
{
  vOut[0] = min(vIn, Vec<Byte, 64>(_mm512_set1_epi8(0x7f)));
}

static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Word, 64> vOut[1])
{
  vOut[0] = max(vIn, Vec<Short, 64>(_mm512_setzero_si512()));
}

static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Short, 64> vOut[1])
{
  vOut[0] = min(vIn, Vec<Word, 64>(_mm512_set1_epi16(0x7fff)));
}
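// These single-vector "extends" convert between equally sized signed and
// unsigned element types with saturation, e.g. SignedByte -1 becomes
// Byte 0, and Byte 0xff becomes SignedByte 0x7f.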
static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Short, 64> vOut[2])
{
#ifdef __AVX512BW__
  vOut[0] = _mm512_cvtepi8_epi16(vIn.lo());
  vOut[1] = _mm512_cvtepi8_epi16(vIn.hi());
#else
  {
    const __m256i lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn.lo()));
    const __m256i hi =
      _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn.lo(), 1));
    vOut[0] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
  }
  {
    const __m256i lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn.hi()));
    const __m256i hi =
      _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn.hi(), 1));
    vOut[1] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
  }
#endif
}

static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Int, 64> vOut[2])
{
  vOut[0] = _mm512_cvtepi16_epi32(vIn.lo());
  vOut[1] = _mm512_cvtepi16_epi32(vIn.hi());
}

static SIMD_INLINE void extend(const Vec<Short, 64> &vIn,
                               Vec<Float, 64> vOut[2])
{
  vOut[0] = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vIn.lo()));
  vOut[1] = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vIn.hi()));
}

static SIMD_INLINE void extend(const Vec<Int, 64> &vIn, Vec<Long, 64> vecOut[2])
{
  vecOut[0] = _mm512_cvtepi32_epi64(vIn.lo());
  vecOut[1] = _mm512_cvtepi32_epi64(vIn.hi());
}

static SIMD_INLINE void extend(const Vec<Int, 64> &vIn,
                               Vec<Double, 64> vecOut[2])
{
  vecOut[0] = _mm512_cvtepi32_pd(vIn.lo());
  vecOut[1] = _mm512_cvtepi32_pd(vIn.hi());
}

static SIMD_INLINE void extend(const Vec<Float, 64> &vIn,
                               Vec<Long, 64> vecOut[2])
{
  const Vec<Float, 64> clipped =
    _mm512_min_ps(_mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64), vIn);
#ifdef __AVX512DQ__
  vecOut[0] = _mm512_cvtps_epi64(clipped.lo());
  vecOut[1] = _mm512_cvtps_epi64(clipped.hi());
#else
  vecOut[0] = cvts(_mm512_cvtps_pd(clipped.lo()), OutputType<Long>());
  vecOut[1] = cvts(_mm512_cvtps_pd(clipped.hi()), OutputType<Long>());
#endif
}

static SIMD_INLINE void extend(const Vec<Float, 64> &vIn,
                               Vec<Double, 64> vecOut[2])
{
  vecOut[0] = _mm512_cvtps_pd(vIn.lo());
  vecOut[1] = _mm512_cvtps_pd(vIn.hi());
}
static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Word, 64> vOut[2])
{
  // zero-extend by interleaving with zero bytes
  vOut[0] = unpack(vIn, setzero(OutputType<Byte>(), Integer<64>()), Part<0>(),
                   Bytes<1>());
  vOut[1] = unpack(vIn, setzero(OutputType<Byte>(), Integer<64>()), Part<1>(),
                   Bytes<1>());
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Short, 64> vOut[2])
{
#ifdef __AVX512BW__
  vOut[0] = _mm512_cvtepu8_epi16(vIn.lo());
  vOut[1] = _mm512_cvtepu8_epi16(vIn.hi());
#else
  {
    const __m256i lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn.lo()));
    const __m256i hi =
      _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn.lo(), 1));
    vOut[0] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
  }
  {
    const __m256i lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn.hi()));
    const __m256i hi =
      _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn.hi(), 1));
    vOut[1] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
  }
#endif
}

static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Int, 64> vOut[2])
{
  vOut[0] = _mm512_cvtepu16_epi32(vIn.lo());
  vOut[1] = _mm512_cvtepu16_epi32(vIn.hi());
}

static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Float, 64> vOut[2])
{
  vOut[0] = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(vIn.lo()));
  vOut[1] = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(vIn.hi()));
}

static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Word, 64> vOut[2])
{
  // clamp negative values to zero, then zero-extend
  const Vec<SignedByte, 64> saturated =
    max(vIn, Vec<SignedByte, 64>(_mm512_setzero_si512()));
  vOut[0] = unpack(saturated, setzero(OutputType<SignedByte>(), Integer<64>()),
                   Part<0>(), Bytes<1>());
  vOut[1] = unpack(saturated, setzero(OutputType<SignedByte>(), Integer<64>()),
                   Part<1>(), Bytes<1>());
}
static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Int, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepi8_epi32(_mm256_castsi256_si128(vIn.lo()));
  vOut[1] = _mm512_cvtepi8_epi32(_mm256_extractf128_si256(vIn.lo(), 1));
  vOut[2] = _mm512_cvtepi8_epi32(_mm256_castsi256_si128(vIn.hi()));
  vOut[3] = _mm512_cvtepi8_epi32(_mm256_extractf128_si256(vIn.hi(), 1));
}

static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Float, 64> vOut[4])
{
  Vec<Int, 64> vTmp[4];
  extend(vIn, vTmp);
  for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
}

static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Long, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 0));
  vOut[1] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 1));
  vOut[2] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 2));
  vOut[3] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 3));
}

static SIMD_INLINE void extend(const Vec<Short, 64> &vIn,
                               Vec<Double, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepi32_pd(
    _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 0)));
  vOut[1] = _mm512_cvtepi32_pd(
    _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 1)));
  vOut[2] = _mm512_cvtepi32_pd(
    _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 2)));
  vOut[3] = _mm512_cvtepi32_pd(
    _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 3)));
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Int, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(vIn.lo()));
  vOut[1] = _mm512_cvtepu8_epi32(_mm256_extractf128_si256(vIn.lo(), 1));
  vOut[2] = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(vIn.hi()));
  vOut[3] = _mm512_cvtepu8_epi32(_mm256_extractf128_si256(vIn.hi(), 1));
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Float, 64> vOut[4])
{
  Vec<Int, 64> vTmp[4];
  extend(vIn, vTmp);
  for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
}

static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Long, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 0));
  vOut[1] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 1));
  vOut[2] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 2));
  vOut[3] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 3));
}

static SIMD_INLINE void extend(const Vec<Word, 64> &vIn,
                               Vec<Double, 64> vOut[4])
{
  vOut[0] = _mm512_cvtepi32_pd(
    _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 0)));
  vOut[1] = _mm512_cvtepi32_pd(
    _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 1)));
  vOut[2] = _mm512_cvtepi32_pd(
    _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 2)));
  vOut[3] = _mm512_cvtepi32_pd(
    _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 3)));
}
static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Long, 64> vOut[8])
{
  vOut[0] = _mm512_cvtepi8_epi64(_mm512_castsi512_si128(vIn));
  vOut[1] =
    _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_castsi512_si128(vIn), 8));
  vOut[2] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 1));
  vOut[3] =
    _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 1), 8));
  vOut[4] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 2));
  vOut[5] =
    _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 2), 8));
  vOut[6] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 3));
  vOut[7] =
    _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 3), 8));
}

static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
                               Vec<Double, 64> vOut[8])
{
  const __m128i vIn128[4] = {
    _mm512_extracti32x4_epi32(vIn, 0),
    _mm512_extracti32x4_epi32(vIn, 1),
    _mm512_extracti32x4_epi32(vIn, 2),
    _mm512_extracti32x4_epi32(vIn, 3),
  };
  for (size_t i = 0; i < 4; i++) {
    vOut[i * 2 + 0] = _mm512_cvtepi32_pd(_mm256_cvtepi8_epi32(vIn128[i]));
    vOut[i * 2 + 1] =
      _mm512_cvtepi32_pd(_mm256_cvtepi8_epi32(_mm_srli_si128(vIn128[i], 8)));
  }
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Long, 64> vOut[8])
{
  vOut[0] = _mm512_cvtepu8_epi64(_mm512_castsi512_si128(vIn));
  vOut[1] =
    _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_castsi512_si128(vIn), 8));
  vOut[2] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 1));
  vOut[3] =
    _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 1), 8));
  vOut[4] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 2));
  vOut[5] =
    _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 2), 8));
  vOut[6] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 3));
  vOut[7] =
    _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 3), 8));
}

static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn,
                               Vec<Double, 64> vOut[8])
{
  const __m128i vIn128[4] = {
    _mm512_extracti32x4_epi32(vIn, 0),
    _mm512_extracti32x4_epi32(vIn, 1),
    _mm512_extracti32x4_epi32(vIn, 2),
    _mm512_extracti32x4_epi32(vIn, 3),
  };
  for (size_t i = 0; i < 4; i++) {
    vOut[i * 2 + 0] = _mm512_cvtepi32_pd(_mm256_cvtepu8_epi32(vIn128[i]));
    vOut[i * 2 + 1] =
      _mm512_cvtepi32_pd(_mm256_cvtepu8_epi32(_mm_srli_si128(vIn128[i], 8)));
  }
}
2742template <
typename Tout,
typename Tin,
2743 SIMD_ENABLE_IF(
sizeof(Tin) ==
sizeof(Tout)),
2744 SIMD_ENABLE_IF(std::is_floating_point<Tin>::value !=
2745 std::is_floating_point<Tout>::value)>
2746static SIMD_INLINE
void extend(
const Vec<Tin, 64> &vIn, Vec<Tout, 64> vOut[1])
2748 vOut[0] = cvts(vIn, OutputType<Tout>());
2758template <
size_t COUNT>
2759static SIMD_INLINE Vec<Byte, 64> srai(
const Vec<Byte, 64> &a)
2761 const __m512i odd = _mm512_srai_epi16(a, vec::min(COUNT, 7ul));
2762 const __m512i even =
2763 _mm512_srai_epi16(_mm512_slli_epi16(a, 8), vec::min(COUNT, 7ul) + 8);
2764 const __mmask64 mask = __mmask64(0x5555555555555555);
2765 return _mm512_mask_blend_epi8(mask, odd, even);
2768template <
size_t COUNT>
2769static SIMD_INLINE Vec<SignedByte, 64> srai(
const Vec<SignedByte, 64> &a)
2771 const __m512i odd = _mm512_srai_epi16(a, vec::min(COUNT, 7ul));
2772 const __m512i even =
2773 _mm512_srai_epi16(_mm512_slli_epi16(a, 8), vec::min(COUNT, 7ul) + 8);
2774 const __mmask64 mask = __mmask64(0x5555555555555555);
2775 return _mm512_mask_blend_epi8(mask, odd, even);
2778template <
size_t COUNT>
2779static SIMD_INLINE Vec<Word, 64> srai(
const Vec<Word, 64> &a)
2781 return _mm512_srai_epi16(a, vec::min(COUNT, 15ul));
2784template <
size_t COUNT>
2785static SIMD_INLINE Vec<Short, 64> srai(
const Vec<Short, 64> &a)
2787 return _mm512_srai_epi16(a, vec::min(COUNT, 15ul));
2793template <
size_t COUNT,
typename T>
2794static SIMD_INLINE Vec<T, 64> srai(
const Vec<T, 64> &a)
2796 return Vec<T, 64>(srai<COUNT>(a.lo()), srai<COUNT>(a.hi()));
2801template <
size_t COUNT>
2802static SIMD_INLINE Vec<Int, 64> srai(
const Vec<Int, 64> &a)
2804 return _mm512_srai_epi32(a, vec::min(COUNT, 31ul));
2807template <
size_t COUNT>
2808static SIMD_INLINE Vec<Long, 64> srai(
const Vec<Long, 64> &a)
2810 return _mm512_srai_epi64(a, vec::min(COUNT, 63ul));
2817template <
size_t COUNT>
2818static SIMD_INLINE Vec<Byte, 64> srli(
const Vec<Byte, 64> &a)
2820 SIMD_IF_CONSTEXPR (COUNT < 8) {
2823 return _mm512_and_si512(_mm512_set1_epi8((int8_t) (0xff >> COUNT)),
2824 _mm512_srli_epi32(a, COUNT));
2826 return _mm512_setzero_si512();
2830template <
size_t COUNT>
2831static SIMD_INLINE Vec<SignedByte, 64> srli(
const Vec<SignedByte, 64> &a)
2833 SIMD_IF_CONSTEXPR (COUNT < 8) {
2836 return _mm512_and_si512(_mm512_set1_epi8((int8_t) (0xff >> COUNT)),
2837 _mm512_srli_epi32(a, COUNT));
2839 return _mm512_setzero_si512();
2843template <
size_t COUNT>
2844static SIMD_INLINE Vec<Word, 64> srli(
const Vec<Word, 64> &a)
2846 SIMD_IF_CONSTEXPR (COUNT < 32) {
2848 return _mm512_srli_epi16(a, COUNT);
2850 return _mm512_and_si512(_mm512_set1_epi16((int16_t) (0xffff >> COUNT)),
2851 _mm512_srli_epi32(a, COUNT));
2854 return _mm512_setzero_si512();
2858template <
size_t COUNT>
2859static SIMD_INLINE Vec<Short, 64> srli(
const Vec<Short, 64> &a)
2861 SIMD_IF_CONSTEXPR (COUNT < 32) {
2863 return _mm512_srli_epi16(a, COUNT);
2865 return _mm512_and_si512(_mm512_set1_epi16((int16_t) (0xffff >> COUNT)),
2866 _mm512_srli_epi32(a, COUNT));
2869 return _mm512_setzero_si512();
2873template <
size_t COUNT>
2874static SIMD_INLINE Vec<Int, 64> srli(
const Vec<Int, 64> &a)
2876 SIMD_IF_CONSTEXPR (COUNT < 32) {
2877 return _mm512_srli_epi32(a, COUNT);
2879 return _mm512_setzero_si512();
2883template <
size_t COUNT>
2884static SIMD_INLINE Vec<Long, 64> srli(
const Vec<Long, 64> &a)
2886 SIMD_IF_CONSTEXPR (COUNT < 64) {
2887 return _mm512_srli_epi64(a, COUNT);
2889 return _mm512_setzero_si512();
2897template <
size_t COUNT>
2898static SIMD_INLINE Vec<Byte, 64> slli(
const Vec<Byte, 64> &a)
2900 SIMD_IF_CONSTEXPR (COUNT < 8) {
2903 return _mm512_and_si512(
2904 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2905 _mm512_slli_epi32(a, COUNT));
2907 return _mm512_setzero_si512();
2911template <
size_t COUNT>
2912static SIMD_INLINE Vec<SignedByte, 64> slli(
const Vec<SignedByte, 64> &a)
2914 SIMD_IF_CONSTEXPR (COUNT < 8) {
2917 return _mm512_and_si512(
2918 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2919 _mm512_slli_epi32(a, COUNT));
2921 return _mm512_setzero_si512();
2925template <
size_t COUNT>
2926static SIMD_INLINE Vec<Word, 64> slli(
const Vec<Word, 64> &a)
2928 SIMD_IF_CONSTEXPR (COUNT < 16) {
2930 return _mm512_slli_epi16(a, COUNT);
2932 return _mm512_and_si512(
2933 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << COUNT))),
2934 _mm512_slli_epi32(a, COUNT));
2937 return _mm512_setzero_si512();
2941template <
size_t COUNT>
2942static SIMD_INLINE Vec<Short, 64> slli(
const Vec<Short, 64> &a)
2944 SIMD_IF_CONSTEXPR (COUNT < 16) {
2946 return _mm512_slli_epi16(a, COUNT);
2948 return _mm512_and_si512(
2949 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << COUNT))),
2950 _mm512_slli_epi32(a, COUNT));
2953 return _mm512_setzero_si512();
2957template <
size_t COUNT>
2958static SIMD_INLINE Vec<Int, 64> slli(
const Vec<Int, 64> &a)
2960 SIMD_IF_CONSTEXPR (COUNT < 32) {
2961 return _mm512_slli_epi32(a, COUNT);
2963 return _mm512_setzero_si512();
2967template <
size_t COUNT>
2968static SIMD_INLINE Vec<Long, 64> slli(
const Vec<Long, 64> &a)
2970 SIMD_IF_CONSTEXPR (COUNT < 64) {
2971 return _mm512_slli_epi64(a, COUNT);
2973 return _mm512_setzero_si512();
2985static SIMD_INLINE Vec<Byte, 64> sra(
const Vec<Byte, 64> &a,
2986 const uint8_t count)
2990 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, _mm512_setzero_si512()));
2992 __m512i odd = _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
2994 _mm512_sra_epi16(_mm512_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
2995 __mmask64 mask = __mmask64(0x5555555555555555);
2996 return _mm512_mask_blend_epi8(mask, odd, even);
2999static SIMD_INLINE Vec<SignedByte, 64> sra(
const Vec<SignedByte, 64> &a,
3000 const uint8_t count)
3004 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, _mm512_setzero_si512()));
3006 __m512i odd = _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3008 _mm512_sra_epi16(_mm512_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3009 __mmask64 mask = __mmask64(0x5555555555555555);
3010 return _mm512_mask_blend_epi8(mask, odd, even);
3013static SIMD_INLINE Vec<Word, 64> sra(
const Vec<Word, 64> &a,
3014 const uint8_t count)
3016 return _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3019static SIMD_INLINE Vec<Short, 64> sra(
const Vec<Short, 64> &a,
3020 const uint8_t count)
3022 return _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3028template <
typename T>
3029static SIMD_INLINE Vec<T, 64> sra(
const Vec<T, 64> &a,
const uint8_t count)
3031 return Vec<T, 64>(sra(a.lo(), count), sra(a.hi(), count));
3036static SIMD_INLINE Vec<Int, 64> sra(
const Vec<Int, 64> &a,
const uint8_t count)
3038 return _mm512_sra_epi32(a, _mm_cvtsi32_si128(count));
3041static SIMD_INLINE Vec<Long, 64> sra(
const Vec<Long, 64> &a,
3042 const uint8_t count)
3044 return _mm512_sra_epi64(a, _mm_cvtsi32_si128(count));
3051static SIMD_INLINE Vec<Byte, 64> srl(
const Vec<Byte, 64> &a,
3052 const uint8_t count)
3054 return _mm512_and_si512(_mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3055 _mm512_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3058static SIMD_INLINE Vec<SignedByte, 64> srl(
const Vec<SignedByte, 64> &a,
3059 const uint8_t count)
3061 return _mm512_and_si512(_mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3062 _mm512_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3065static SIMD_INLINE Vec<Word, 64> srl(
const Vec<Word, 64> &a,
3066 const uint8_t count)
3069 return _mm512_srl_epi16(a, _mm_cvtsi32_si128(count));
3071 return _mm512_and_si512(
3072 _mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3073 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff >> count)));
3077static SIMD_INLINE Vec<Short, 64> srl(
const Vec<Short, 64> &a,
3078 const uint8_t count)
3081 return _mm512_srl_epi16(a, _mm_cvtsi32_si128(count));
3083 return _mm512_and_si512(
3084 _mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3085 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff >> count)));
3089static SIMD_INLINE Vec<Int, 64> srl(
const Vec<Int, 64> &a,
const uint8_t count)
3091 return _mm512_srl_epi32(a, _mm_cvtsi32_si128(count));
3094static SIMD_INLINE Vec<Long, 64> srl(
const Vec<Long, 64> &a,
3095 const uint8_t count)
3097 return _mm512_srl_epi64(a, _mm_cvtsi32_si128(count));
3104static SIMD_INLINE Vec<Byte, 64> sll(
const Vec<Byte, 64> &a,
3105 const uint8_t count)
3107 return _mm512_and_si512(
3108 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3109 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3112static SIMD_INLINE Vec<SignedByte, 64> sll(
const Vec<SignedByte, 64> &a,
3113 const uint8_t count)
3115 return _mm512_and_si512(
3116 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3117 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3120static SIMD_INLINE Vec<Word, 64> sll(
const Vec<Word, 64> &a,
3121 const uint8_t count)
3124 return _mm512_sll_epi16(a, _mm_cvtsi32_si128(count));
3126 return _mm512_and_si512(
3127 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3128 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << count))));
3132static SIMD_INLINE Vec<Short, 64> sll(
const Vec<Short, 64> &a,
3133 const uint8_t count)
3136 return _mm512_sll_epi16(a, _mm_cvtsi32_si128(count));
3138 return _mm512_and_si512(
3139 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3140 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << count))));
3144static SIMD_INLINE Vec<Int, 64> sll(
const Vec<Int, 64> &a,
const uint8_t count)
3146 return _mm512_sll_epi32(a, _mm_cvtsi32_si128(count));
3149static SIMD_INLINE Vec<Long, 64> sll(
const Vec<Long, 64> &a,
3150 const uint8_t count)
3152 return _mm512_sll_epi64(a, _mm_cvtsi32_si128(count));
3166template <
typename T>
3167static SIMD_INLINE Vec<T, 64> hadd(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3170 unzip<1>(a, b, x, y);
3178template <
typename T>
3179static SIMD_INLINE Vec<T, 64> hadds(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3182 unzip<1>(a, b, x, y);
3190template <
typename T>
3191static SIMD_INLINE Vec<T, 64> hsub(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3194 unzip<1>(a, b, x, y);
3202template <
typename T>
3203static SIMD_INLINE Vec<T, 64> hsubs(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3206 unzip<1>(a, b, x, y);
3218template <
size_t AB0,
size_t I0,
size_t AB1,
size_t I1,
size_t AB2,
size_t I2,
3219 size_t AB3,
size_t I3,
typename T>
3220static SIMD_INLINE Vec<T, 64> permute_64_16(
const Vec<T, 64> &a,
3221 const Vec<T, 64> &b)
3224 const __m512i mask = _mm512_set_epi64(
3225 (AB3 << 3) | (2 * I3 + 1), (AB3 << 3) | (2 * I3), (AB2 << 3) | (2 * I2 + 1),
3226 (AB2 << 3) | (2 * I2), (AB1 << 3) | (2 * I1 + 1), (AB1 << 3) | (2 * I1),
3227 (AB0 << 3) | (2 * I0 + 1), (AB0 << 3) | (2 * I0));
3229 const Vec<Int, 64> res = _mm512_permutex2var_epi64(
3230 reinterpret(a, OutputType<Int>()), mask, reinterpret(b, OutputType<Int>()));
3231 return reinterpret(res, OutputType<T>());
3277template <
size_t NB,
typename T>
3278static SIMD_INLINE Vec<T, 64> align_64_16(
const Vec<T, 64> &a,
3279 const Vec<T, 64> &b)
3281 SIMD_IF_CONSTEXPR (NB == 0) {
3283 }
else SIMD_IF_CONSTEXPR (NB == 4) {
3286 return permute_64_16<(NB > 3), (NB % 4), (NB > 2), (NB + 1) % 4, (NB > 1),
3287 (NB + 2) % 4, (NB > 0), (NB + 3) % 4>(a, b);
3292template <
size_t COUNT,
typename T>
3293static SIMD_INLINE Vec<T, 64> alignre(
const Vec<T, 64> &h,
const Vec<T, 64> &l)
3295 const auto byteShift = COUNT *
sizeof(T);
3296 SIMD_IF_CONSTEXPR (byteShift < 128) {
3297 const auto laneShift = byteShift / 16;
3298 const Vec<T, 64> L = (byteShift < 64) ? l : h;
3299 const Vec<T, 64> H =
3300 (byteShift < 64) ? h : setzero(OutputType<T>(), Integer<64>());
3301 const Vec<T, 64> ll = align_64_16<laneShift % 4>(L, H);
3302 const Vec<T, 64> hh = align_64_16<laneShift % 4 + 1>(L, H);
3303 return reinterpret(Vec<Byte, 64>(x_mm512_alignr_epi8<byteShift % 16>(
3304 reinterpret(hh, OutputType<Byte>()),
3305 reinterpret(ll, OutputType<Byte>()))),
3308 return setzero(OutputType<T>(), Integer<64>());
3319template <
size_t COUNT,
typename T>
3320static SIMD_INLINE Vec<T, 64> srle(
const Vec<T, 64> &a)
3322 SIMD_IF_CONSTEXPR (COUNT < Vec<T, 64>::elements) {
3323 return alignre<COUNT>(setzero(OutputType<T>(), Integer<64>()), a);
3325 return setzero(OutputType<T>(), Integer<64>());
3336template <
size_t COUNT,
typename T>
3337static SIMD_INLINE Vec<T, 64> slle(
const Vec<T, 64> &a)
3339 SIMD_IF_CONSTEXPR (COUNT < Vec<T, 64>::elements) {
3340 return alignre<Vec<T, 64>::elements - COUNT>(
3341 a, setzero(OutputType<T>(), Integer<64>()));
3343 return setzero(OutputType<T>(), Integer<64>());
3354template <
size_t ALIGNOFF>
3355static SIMD_INLINE __m512i align_shuffle_512(__m512i lo, __m512i hi,
3358 static_assert(ALIGNOFF < 32,
"");
3359 return x_mm512_shuffle_epi8(x_mm512_alignr_epi8<ALIGNOFF>(hi, lo), mask);
3406template <
size_t N,
typename T>
3407struct Swizzle_64_16;
3412template <
typename T>
3413struct Swizzle_64_16<2, T>
3415 static SIMD_INLINE
void _swizzle_64_16(
const Vec<T, 64> vIn[2],
3418 vOut[0] = permute_64_16<0, 0, 0, 2, 1, 0, 1, 2>(vIn[0], vIn[1]);
3419 vOut[1] = permute_64_16<0, 1, 0, 3, 1, 1, 1, 3>(vIn[0], vIn[1]);
3427template <
typename T>
3428struct Swizzle_64_16<3, T>
3430 static SIMD_INLINE
void _swizzle_64_16(
const Vec<T, 64> vIn[3],
3434 vTmp[0] = permute_64_16<0, 0, 1, 2, 0, 1, 1, 0>(vIn[0], vIn[1]);
3435 vTmp[1] = permute_64_16<0, 3, 1, 2, 0, 1, 1, 0>(vIn[1], vIn[2]);
3436 vTmp[2] = permute_64_16<0, 3, 1, 1, 0, 2, 1, 3>(vIn[0], vIn[2]);
3438 vOut[0] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vTmp[0], vTmp[2]);
3439 vOut[1] = permute_64_16<0, 2, 0, 3, 1, 0, 1, 1>(vTmp[0], vTmp[1]);
3440 vOut[2] = permute_64_16<1, 2, 0, 2, 0, 3, 1, 3>(vTmp[1], vTmp[2]);
3448template <
typename T>
3449struct Swizzle_64_16<4, T>
3451 static SIMD_INLINE
void _swizzle_64_16(
const Vec<T, 64> vIn[4],
3455 vTmp[0] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vIn[0], vIn[1]);
3456 vTmp[1] = permute_64_16<0, 2, 1, 2, 0, 3, 1, 3>(vIn[0], vIn[1]);
3457 vTmp[2] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vIn[2], vIn[3]);
3458 vTmp[3] = permute_64_16<0, 2, 1, 2, 0, 3, 1, 3>(vIn[2], vIn[3]);
3460 vOut[0] = permute_64_16<0, 0, 0, 1, 1, 0, 1, 1>(vTmp[0], vTmp[2]);
3461 vOut[1] = permute_64_16<0, 2, 0, 3, 1, 2, 1, 3>(vTmp[0], vTmp[2]);
3462 vOut[2] = permute_64_16<0, 0, 0, 1, 1, 0, 1, 1>(vTmp[1], vTmp[3]);
3463 vOut[3] = permute_64_16<0, 2, 0, 3, 1, 2, 1, 3>(vTmp[1], vTmp[3]);
3471template <
typename T>
3472struct Swizzle_64_16<5, T>
3474 static SIMD_INLINE
void _swizzle_64_16(
const Vec<T, 64> vIn[5],
3478 vTmp[0] = permute_64_16<0, 1, 1, 2, 0, 2, 1, 3>(vIn[1], vIn[2]);
3479 vTmp[1] = permute_64_16<0, 1, 1, 0, 0, 3, 1, 2>(vIn[0], vIn[4]);
3480 vTmp[2] = permute_64_16<0, 0, 1, 1, 0, 1, 1, 2>(vIn[2], vIn[3]);
3481 vTmp[3] = permute_64_16<0, 3, 1, 1, 0, 0, 1, 3>(vIn[1], vIn[4]);
3482 vTmp[4] = permute_64_16<0, 0, 1, 3, 0, 2, 1, 0>(vIn[0], vIn[3]);
3484 vOut[0] = permute_64_16<1, 0, 0, 0, 0, 1, 1, 1>(vTmp[0], vTmp[4]);
3485 vOut[1] = permute_64_16<1, 0, 0, 2, 0, 3, 1, 1>(vTmp[0], vTmp[1]);
3486 vOut[2] = permute_64_16<1, 2, 0, 0, 1, 3, 0, 1>(vTmp[3], vTmp[4]);
3487 vOut[3] = permute_64_16<0, 2, 1, 0, 1, 1, 0, 3>(vTmp[1], vTmp[2]);
3488 vOut[4] = permute_64_16<1, 2, 0, 2, 0, 3, 1, 3>(vTmp[2], vTmp[3]);
3493template <
size_t N,
typename T>
3494static SIMD_INLINE
void swizzle_64_16(
const Vec<T, 64> vIn[N],
3497 Swizzle_64_16<N, T>::_swizzle_64_16(vIn, vOut);
3509template <
typename T>
3510static SIMD_INLINE
void swizzle(Vec<T, 64>[1], Integer<1>)
3518template <
typename T,
3519 SIMD_ENABLE_IF((
sizeof(T) <= 2 && std::is_integral<T>::value))>
3520static SIMD_INLINE
void swizzle(Vec<T, 64> v[2], Integer<2>)
3523 swizzle_64_16<2>(v, vs);
3524 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<2, T>());
3525 const __m512i s[2] = {
3526 x_mm512_shuffle_epi8(vs[0], mask),
3527 x_mm512_shuffle_epi8(vs[1], mask),
3529 v[0] = _mm512_unpacklo_epi64(s[0], s[1]);
3530 v[1] = _mm512_unpackhi_epi64(s[0], s[1]);
3534template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3535static SIMD_INLINE
void swizzle(Vec<T, 64> v[2], Integer<2>)
3537 const Vec<Float, 64> vFloat[2] = {
3538 reinterpret(v[0], OutputType<Float>()),
3539 reinterpret(v[1], OutputType<Float>()),
3541 Vec<Float, 64> vs[2];
3542 swizzle_64_16<2>(vFloat, vs);
3543 const Vec<Float, 64> vOut[2] = {
3544 _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(2, 0, 2, 0)),
3545 _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 1, 3, 1)),
3547 v[0] = reinterpret(vOut[0], OutputType<T>());
3548 v[1] = reinterpret(vOut[1], OutputType<T>());
3552template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3554static SIMD_INLINE
void swizzle(Vec<T, 64> v[2], Integer<2>)
3556 const Vec<Double, 64> vDouble[2] = {
3557 reinterpret(v[0], OutputType<Double>()),
3558 reinterpret(v[1], OutputType<Double>()),
3560 Vec<Double, 64> vs[2];
3561 swizzle_64_16<2>(vDouble, vs);
3562 const Vec<Double, 64> vOut[2] = {
3563 _mm512_shuffle_pd(vs[0], vs[1], 0x00),
3564 _mm512_shuffle_pd(vs[0], vs[1], 0xFF),
3566 v[0] = reinterpret(vOut[0], OutputType<T>());
3567 v[1] = reinterpret(vOut[1], OutputType<T>());
3573template <
typename T,
3574 SIMD_ENABLE_IF((
sizeof(T) <= 2 && std::is_integral<T>::value))>
3575static SIMD_INLINE
void swizzle(Vec<T, 64> v[3], Integer<3>)
3578 swizzle_64_16<3>(v, vs);
3579 __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<3, T>());
3580 __m512i s0 = align_shuffle_512<0>(vs[0], vs[1], mask);
3581 __m512i s1 = align_shuffle_512<12>(vs[0], vs[1], mask);
3582 __m512i s2 = align_shuffle_512<8>(vs[1], vs[2], mask);
3583 __m512i s3 = align_shuffle_512<4>(vs[2], _mm512_undefined_epi32(), mask);
3584 __m512i l01 = _mm512_unpacklo_epi32(s0, s1);
3585 __m512i h01 = _mm512_unpackhi_epi32(s0, s1);
3586 __m512i l23 = _mm512_unpacklo_epi32(s2, s3);
3587 __m512i h23 = _mm512_unpackhi_epi32(s2, s3);
3588 v[0] = _mm512_unpacklo_epi64(l01, l23);
3589 v[1] = _mm512_unpackhi_epi64(l01, l23);
3590 v[2] = _mm512_unpacklo_epi64(h01, h23);
3596template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3597static SIMD_INLINE
void swizzle(Vec<T, 64> v[3], Integer<3>)
3599 const Vec<Float, 64> vFloat[3] = {
3600 reinterpret(v[0], OutputType<Float>()),
3601 reinterpret(v[1], OutputType<Float>()),
3602 reinterpret(v[2], OutputType<Float>()),
3604 Vec<Float, 64> vs[3];
3605 swizzle_64_16<3>(vFloat, vs);
3609 __m512 x2y2x3y3 = _mm512_shuffle_ps(vs[1], vs[2], _MM_SHUFFLE(2, 1, 3, 2));
3610 __m512 y0z0y1z1 = _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 2, 1));
3612 const Vec<Float, 64> vOut0 =
3613 _mm512_shuffle_ps(vs[0], x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
3615 const Vec<Float, 64> vOut1 =
3616 _mm512_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
3618 const Vec<Float, 64> vOut2 =
3619 _mm512_shuffle_ps(y0z0y1z1, vs[2], _MM_SHUFFLE(3, 0, 3, 1));
3620 v[0] = reinterpret(vOut0, OutputType<T>());
3621 v[1] = reinterpret(vOut1, OutputType<T>());
3622 v[2] = reinterpret(vOut2, OutputType<T>());
3626template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3628static SIMD_INLINE
void swizzle(Vec<T, 64> v[3], Integer<3>)
3630 const Vec<Double, 64> vDouble[3] = {
3631 reinterpret(v[0], OutputType<Double>()),
3632 reinterpret(v[1], OutputType<Double>()),
3633 reinterpret(v[2], OutputType<Double>()),
3635 Vec<Double, 64> vs[3];
3636 swizzle_64_16<3>(vDouble, vs);
3637 const Vec<Double, 64> vOut[3] = {
3638 _mm512_shuffle_pd(vs[0], vs[1], 0xaa),
3639 _mm512_shuffle_pd(vs[0], vs[2], 0x55),
3640 _mm512_shuffle_pd(vs[1], vs[2], 0xaa),
3642 v[0] = reinterpret(vOut[0], OutputType<T>());
3643 v[1] = reinterpret(vOut[1], OutputType<T>());
3644 v[2] = reinterpret(vOut[2], OutputType<T>());
3650template <
typename T,
3651 SIMD_ENABLE_IF((
sizeof(T) <= 2 && std::is_integral<T>::value))>
3652static SIMD_INLINE
void swizzle(Vec<T, 64> v[4], Integer<4>)
3655 swizzle_64_16<4>(v, vs);
3656 __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<4, T>());
3658 for (
size_t j = 0; j < 4; j++) s[j] = x_mm512_shuffle_epi8(vs[j], mask);
3659 __m512i l01 = _mm512_unpacklo_epi32(s[0], s[1]);
3660 __m512i h01 = _mm512_unpackhi_epi32(s[0], s[1]);
3661 __m512i l23 = _mm512_unpacklo_epi32(s[2], s[3]);
3662 __m512i h23 = _mm512_unpackhi_epi32(s[2], s[3]);
3663 v[0] = _mm512_unpacklo_epi64(l01, l23);
3664 v[1] = _mm512_unpackhi_epi64(l01, l23);
3665 v[2] = _mm512_unpacklo_epi64(h01, h23);
3666 v[3] = _mm512_unpackhi_epi64(h01, h23);
3670template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3671static SIMD_INLINE
void swizzle(Vec<T, 64> v[4], Integer<4>)
3673 Vec<Int, 64> vInt[4];
3674 for (
size_t i = 0; i < 4; i++) vInt[i] = reinterpret(v[i], OutputType<Int>());
3676 swizzle_64_16<4>(vInt, vs);
3677 const __m512i s[4] = {
3678 _mm512_unpacklo_epi32(vs[0], vs[1]),
3679 _mm512_unpackhi_epi32(vs[0], vs[1]),
3680 _mm512_unpacklo_epi32(vs[2], vs[3]),
3681 _mm512_unpackhi_epi32(vs[2], vs[3]),
3683 const Vec<Int, 64> vOut[4] = {
3684 _mm512_unpacklo_epi64(s[0], s[2]),
3685 _mm512_unpackhi_epi64(s[0], s[2]),
3686 _mm512_unpacklo_epi64(s[1], s[3]),
3687 _mm512_unpackhi_epi64(s[1], s[3]),
3689 for (
size_t i = 0; i < 4; i++) v[i] = reinterpret(vOut[i], OutputType<T>());
3693template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3695static SIMD_INLINE
void swizzle(Vec<T, 64> v[4], Integer<4>)
3697 Vec<Double, 64> vDouble[4];
3698 for (
size_t i = 0; i < 4; i++)
3699 vDouble[i] = reinterpret(v[i], OutputType<Double>());
3700 Vec<Double, 64> vs[4];
3701 swizzle_64_16<4>(vDouble, vs);
3702 const Vec<Double, 64> vOut[4] = {
3703 _mm512_shuffle_pd(vs[0], vs[2], 0x00),
3704 _mm512_shuffle_pd(vs[0], vs[2], 0xFF),
3705 _mm512_shuffle_pd(vs[1], vs[3], 0x00),
3706 _mm512_shuffle_pd(vs[1], vs[3], 0xFF),
3708 for (
size_t i = 0; i < 4; i++) v[i] = reinterpret(vOut[i], OutputType<T>());
3714template <
typename T,
3715 SIMD_ENABLE_IF(
sizeof(T) == 1 && std::is_integral<T>::value)>
3716static SIMD_INLINE
void swizzle(Vec<T, 64> v[5], Integer<5>)
3719 swizzle_64_16<5>(v, vs);
3720 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<5, T>());
3721 const __m512i s[8] = {
3722 align_shuffle_512<0>(vs[0], vs[1], mask),
3723 align_shuffle_512<10>(vs[0], vs[1], mask),
3724 align_shuffle_512<4>(vs[1], vs[2], mask),
3725 align_shuffle_512<14>(vs[1], vs[2], mask),
3726 align_shuffle_512<8>(vs[2], vs[3], mask),
3727 align_shuffle_512<2>(vs[3], vs[4], mask),
3728 align_shuffle_512<12>(vs[3], vs[4], mask),
3729 align_shuffle_512<6>(vs[4], _mm512_undefined_epi32(), mask),
3731 __m512i l01 = x_mm512_unpacklo_epi16(s[0], s[1]);
3732 __m512i h01 = x_mm512_unpackhi_epi16(s[0], s[1]);
3733 __m512i l23 = x_mm512_unpacklo_epi16(s[2], s[3]);
3734 __m512i h23 = x_mm512_unpackhi_epi16(s[2], s[3]);
3735 __m512i l45 = x_mm512_unpacklo_epi16(s[4], s[5]);
3736 __m512i h45 = x_mm512_unpackhi_epi16(s[4], s[5]);
3737 __m512i l67 = x_mm512_unpacklo_epi16(s[6], s[7]);
3738 __m512i h67 = x_mm512_unpackhi_epi16(s[6], s[7]);
3739 __m512i ll01l23 = _mm512_unpacklo_epi32(l01, l23);
3740 __m512i hl01l23 = _mm512_unpackhi_epi32(l01, l23);
3741 __m512i ll45l67 = _mm512_unpacklo_epi32(l45, l67);
3742 __m512i hl45l67 = _mm512_unpackhi_epi32(l45, l67);
3743 __m512i lh01h23 = _mm512_unpacklo_epi32(h01, h23);
3744 __m512i lh45h67 = _mm512_unpacklo_epi32(h45, h67);
3745 v[0] = _mm512_unpacklo_epi64(ll01l23, ll45l67);
3746 v[1] = _mm512_unpackhi_epi64(ll01l23, ll45l67);
3747 v[2] = _mm512_unpacklo_epi64(hl01l23, hl45l67);
3748 v[3] = _mm512_unpackhi_epi64(hl01l23, hl45l67);
3749 v[4] = _mm512_unpacklo_epi64(lh01h23, lh45h67);
3753template <
typename T,
3754 SIMD_ENABLE_IF(
sizeof(T) == 2 && std::is_integral<T>::value),
3756static SIMD_INLINE
void swizzle(Vec<T, 64> v[5], Integer<5>)
3759 swizzle_64_16<5>(v, vs);
3760 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<5, T>());
3761 const __m512i s[8] = {
3762 align_shuffle_512<0>(vs[0], vs[1], mask),
3763 align_shuffle_512<6>(vs[0], vs[1], mask),
3764 align_shuffle_512<4>(vs[1], vs[2], mask),
3765 align_shuffle_512<10>(vs[1], vs[2], mask),
3766 align_shuffle_512<8>(vs[2], vs[3], mask),
3767 align_shuffle_512<14>(vs[2], vs[3], mask),
3768 align_shuffle_512<12>(vs[3], vs[4], mask),
3769 align_shuffle_512<2>(vs[4], _mm512_undefined_epi32(), mask),
3771 __m512i l02 = _mm512_unpacklo_epi32(s[0], s[2]);
3772 __m512i h02 = _mm512_unpackhi_epi32(s[0], s[2]);
3773 __m512i l13 = _mm512_unpacklo_epi32(s[1], s[3]);
3774 __m512i l46 = _mm512_unpacklo_epi32(s[4], s[6]);
3775 __m512i h46 = _mm512_unpackhi_epi32(s[4], s[6]);
3776 __m512i l57 = _mm512_unpacklo_epi32(s[5], s[7]);
3777 v[0] = _mm512_unpacklo_epi64(l02, l46);
3778 v[1] = _mm512_unpackhi_epi64(l02, l46);
3779 v[2] = _mm512_unpacklo_epi64(h02, h46);
3780 v[3] = _mm512_unpacklo_epi64(l13, l57);
3781 v[4] = _mm512_unpackhi_epi64(l13, l57);
3785template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 4),
typename = void,
3787static SIMD_INLINE
void swizzle(Vec<T, 64> v[5], Integer<5>)
3789 Vec<Int, 64> vInt[5];
3790 for (
size_t i = 0; i < 5; i++) {
3791 vInt[i] = reinterpret(v[i], OutputType<Int>());
3794 swizzle_64_16<5>(vInt, vs);
3801 __m512i s2 = x_mm512_alignr_epi8<4>(vs[2], vs[1]);
3805 __m512i s3 = x_mm512_alignr_epi8<4>(vs[3], vs[2]);
3809 __m512i s4 = x_mm512_alignr_epi8<8>(vs[3], vs[2]);
3813 __m512i s5 = x_mm512_alignr_epi8<8>(vs[4], vs[3]);
3817 __m512i s6 = x_mm512_alignr_epi8<12>(vs[4], vs[3]);
3821 __m512i s7 = x_mm512_alignr_epi8<12>(vs[0], vs[4]);
3823 __m512i l02 = _mm512_unpacklo_epi32(vs[0], s2);
3824 __m512i h02 = _mm512_unpackhi_epi32(vs[0], s2);
3826 __m512i l13 = _mm512_unpacklo_epi32(vs[1], s3);
3828 __m512i l46 = _mm512_unpacklo_epi32(s4, s6);
3829 __m512i h46 = _mm512_unpackhi_epi32(s4, s6);
3831 __m512i l57 = _mm512_unpacklo_epi32(s5, s7);
3832 const Vec<Int, 64> vOut[5] = {
3834 _mm512_unpacklo_epi64(l02, l46),
3835 _mm512_unpackhi_epi64(l02, l46),
3837 _mm512_unpacklo_epi64(h02, h46),
3838 _mm512_unpackhi_epi64(h02, h46),
3840 _mm512_unpacklo_epi64(l13, l57),
3842 for (
size_t i = 0; i < 5; i++) {
3843 v[i] = reinterpret(vOut[i], OutputType<T>());
3848template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3849 typename = void,
typename =
void>
3850static SIMD_INLINE
void swizzle(Vec<T, 64> v[5], Integer<5>)
3852 Vec<Double, 64> vDouble[5];
3853 for (
size_t i = 0; i < 5; i++) {
3854 vDouble[i] = reinterpret(v[i], OutputType<Double>());
3856 Vec<Double, 64> vs[5];
3857 swizzle_64_16<5>(vDouble, vs);
3858 const Vec<Double, 64> vOut[5] = {
3859 _mm512_shuffle_pd(vs[0], vs[2], 0xaa),
3860 _mm512_shuffle_pd(vs[0], vs[3], 0x55),
3861 _mm512_shuffle_pd(vs[1], vs[3], 0xaa),
3862 _mm512_shuffle_pd(vs[1], vs[4], 0x55),
3863 _mm512_shuffle_pd(vs[2], vs[4], 0xaa),
3865 for (
size_t i = 0; i < 5; i++) {
3866 v[i] = reinterpret(vOut[i], OutputType<T>());
3887static SIMD_INLINE Vec<Byte, 64> cmplt(
const Vec<Byte, 64> &a,
3888 const Vec<Byte, 64> &b)
3890 return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(a, b));
3893static SIMD_INLINE Vec<SignedByte, 64> cmplt(
const Vec<SignedByte, 64> &a,
3894 const Vec<SignedByte, 64> &b)
3896 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, b));
3899static SIMD_INLINE Vec<Word, 64> cmplt(
const Vec<Word, 64> &a,
3900 const Vec<Word, 64> &b)
3902 return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(a, b));
3905static SIMD_INLINE Vec<Short, 64> cmplt(
const Vec<Short, 64> &a,
3906 const Vec<Short, 64> &b)
3908 return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(a, b));
3914template <
typename T>
3915static SIMD_INLINE Vec<T, 64> cmplt(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3917 return Vec<T, 64>(cmplt(a.lo(), b.lo()), cmplt(a.hi(), b.hi()));
3922static SIMD_INLINE Vec<Int, 64> cmplt(
const Vec<Int, 64> &a,
3923 const Vec<Int, 64> &b)
3925 return x_mm512_movm_epi32(_mm512_cmplt_epi32_mask(a, b));
3928static SIMD_INLINE Vec<Long, 64> cmplt(
const Vec<Long, 64> &a,
3929 const Vec<Long, 64> &b)
3931 return x_mm512_movm_epi64(_mm512_cmplt_epi64_mask(a, b));
3934static SIMD_INLINE Vec<Float, 64> cmplt(
const Vec<Float, 64> &a,
3935 const Vec<Float, 64> &b)
3939 return _mm512_castsi512_ps(
3940 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_LT_OS)));
3943static SIMD_INLINE Vec<Double, 64> cmplt(
const Vec<Double, 64> &a,
3944 const Vec<Double, 64> &b)
3948 return _mm512_castsi512_pd(
3949 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_LT_OS)));
3961static SIMD_INLINE Vec<Byte, 64> cmple(
const Vec<Byte, 64> &a,
3962 const Vec<Byte, 64> &b)
3964 return _mm512_movm_epi8(_mm512_cmple_epu8_mask(a, b));
3967static SIMD_INLINE Vec<SignedByte, 64> cmple(
const Vec<SignedByte, 64> &a,
3968 const Vec<SignedByte, 64> &b)
3970 return _mm512_movm_epi8(_mm512_cmple_epi8_mask(a, b));
3973static SIMD_INLINE Vec<Word, 64> cmple(
const Vec<Word, 64> &a,
3974 const Vec<Word, 64> &b)
3976 return _mm512_movm_epi16(_mm512_cmple_epu16_mask(a, b));
3979static SIMD_INLINE Vec<Short, 64> cmple(
const Vec<Short, 64> &a,
3980 const Vec<Short, 64> &b)
3982 return _mm512_movm_epi16(_mm512_cmple_epi16_mask(a, b));
3988template <
typename T>
3989static SIMD_INLINE Vec<T, 64> cmple(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
3991 return Vec<T, 64>(cmple(a.lo(), b.lo()), cmple(a.hi(), b.hi()));
3996static SIMD_INLINE Vec<Int, 64> cmple(
const Vec<Int, 64> &a,
3997 const Vec<Int, 64> &b)
3999 return x_mm512_movm_epi32(_mm512_cmple_epi32_mask(a, b));
4002static SIMD_INLINE Vec<Long, 64> cmple(
const Vec<Long, 64> &a,
4003 const Vec<Long, 64> &b)
4005 return x_mm512_movm_epi64(_mm512_cmple_epi64_mask(a, b));
4008static SIMD_INLINE Vec<Float, 64> cmple(
const Vec<Float, 64> &a,
4009 const Vec<Float, 64> &b)
4013 return _mm512_castsi512_ps(
4014 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_LE_OS)));
4017static SIMD_INLINE Vec<Double, 64> cmple(
const Vec<Double, 64> &a,
4018 const Vec<Double, 64> &b)
4022 return _mm512_castsi512_pd(
4023 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_LE_OS)));
4035static SIMD_INLINE Vec<Byte, 64> cmpeq(
const Vec<Byte, 64> &a,
4036 const Vec<Byte, 64> &b)
4038 return _mm512_movm_epi8(_mm512_cmpeq_epu8_mask(a, b));
4041static SIMD_INLINE Vec<SignedByte, 64> cmpeq(
const Vec<SignedByte, 64> &a,
4042 const Vec<SignedByte, 64> &b)
4044 return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(a, b));
4047static SIMD_INLINE Vec<Word, 64> cmpeq(
const Vec<Word, 64> &a,
4048 const Vec<Word, 64> &b)
4050 return _mm512_movm_epi16(_mm512_cmpeq_epu16_mask(a, b));
4053static SIMD_INLINE Vec<Short, 64> cmpeq(
const Vec<Short, 64> &a,
4054 const Vec<Short, 64> &b)
4056 return _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(a, b));
4062template <
typename T>
4063static SIMD_INLINE Vec<T, 64> cmpeq(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4065 return Vec<T, 64>(cmpeq(a.lo(), b.lo()), cmpeq(a.hi(), b.hi()));
4070static SIMD_INLINE Vec<Int, 64> cmpeq(
const Vec<Int, 64> &a,
4071 const Vec<Int, 64> &b)
4073 return x_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(a, b));
4076static SIMD_INLINE Vec<Long, 64> cmpeq(
const Vec<Long, 64> &a,
4077 const Vec<Long, 64> &b)
4079 return x_mm512_movm_epi64(_mm512_cmpeq_epi64_mask(a, b));
4082static SIMD_INLINE Vec<Float, 64> cmpeq(
const Vec<Float, 64> &a,
4083 const Vec<Float, 64> &b)
4087 return _mm512_castsi512_ps(
4088 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)));
4091static SIMD_INLINE Vec<Double, 64> cmpeq(
const Vec<Double, 64> &a,
4092 const Vec<Double, 64> &b)
4096 return _mm512_castsi512_pd(
4097 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)));
4109static SIMD_INLINE Vec<Byte, 64> cmpgt(
const Vec<Byte, 64> &a,
4110 const Vec<Byte, 64> &b)
4112 return _mm512_movm_epi8(_mm512_cmpgt_epu8_mask(a, b));
4115static SIMD_INLINE Vec<SignedByte, 64> cmpgt(
const Vec<SignedByte, 64> &a,
4116 const Vec<SignedByte, 64> &b)
4118 return _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(a, b));
4121static SIMD_INLINE Vec<Word, 64> cmpgt(
const Vec<Word, 64> &a,
4122 const Vec<Word, 64> &b)
4124 return _mm512_movm_epi16(_mm512_cmpgt_epu16_mask(a, b));
4127static SIMD_INLINE Vec<Short, 64> cmpgt(
const Vec<Short, 64> &a,
4128 const Vec<Short, 64> &b)
4130 return _mm512_movm_epi16(_mm512_cmpgt_epi16_mask(a, b));
4136template <
typename T>
4137static SIMD_INLINE Vec<T, 64> cmpgt(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4139 return Vec<T, 64>(cmpgt(a.lo(), b.lo()), cmpgt(a.hi(), b.hi()));
4144static SIMD_INLINE Vec<Int, 64> cmpgt(
const Vec<Int, 64> &a,
4145 const Vec<Int, 64> &b)
4147 return x_mm512_movm_epi32(_mm512_cmpgt_epi32_mask(a, b));
4150static SIMD_INLINE Vec<Long, 64> cmpgt(
const Vec<Long, 64> &a,
4151 const Vec<Long, 64> &b)
4153 return x_mm512_movm_epi64(_mm512_cmpgt_epi64_mask(a, b));
4156static SIMD_INLINE Vec<Float, 64> cmpgt(
const Vec<Float, 64> &a,
4157 const Vec<Float, 64> &b)
4161 return _mm512_castsi512_ps(
4162 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS)));
4165static SIMD_INLINE Vec<Double, 64> cmpgt(
const Vec<Double, 64> &a,
4166 const Vec<Double, 64> &b)
4170 return _mm512_castsi512_pd(
4171 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_GT_OS)));
4183static SIMD_INLINE Vec<Byte, 64> cmpge(
const Vec<Byte, 64> &a,
4184 const Vec<Byte, 64> &b)
4186 return _mm512_movm_epi8(_mm512_cmpge_epu8_mask(a, b));
4189static SIMD_INLINE Vec<SignedByte, 64> cmpge(
const Vec<SignedByte, 64> &a,
4190 const Vec<SignedByte, 64> &b)
4192 return _mm512_movm_epi8(_mm512_cmpge_epi8_mask(a, b));
4195static SIMD_INLINE Vec<Word, 64> cmpge(
const Vec<Word, 64> &a,
4196 const Vec<Word, 64> &b)
4198 return _mm512_movm_epi16(_mm512_cmpge_epu16_mask(a, b));
4201static SIMD_INLINE Vec<Short, 64> cmpge(
const Vec<Short, 64> &a,
4202 const Vec<Short, 64> &b)
4204 return _mm512_movm_epi16(_mm512_cmpge_epi16_mask(a, b));
4210template <
typename T>
4211static SIMD_INLINE Vec<T, 64> cmpge(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4213 return Vec<T, 64>(cmpge(a.lo(), b.lo()), cmpge(a.hi(), b.hi()));
4218static SIMD_INLINE Vec<Int, 64> cmpge(
const Vec<Int, 64> &a,
4219 const Vec<Int, 64> &b)
4221 return x_mm512_movm_epi32(_mm512_cmpge_epi32_mask(a, b));
4224static SIMD_INLINE Vec<Long, 64> cmpge(
const Vec<Long, 64> &a,
4225 const Vec<Long, 64> &b)
4227 return x_mm512_movm_epi64(_mm512_cmpge_epi64_mask(a, b));
4230static SIMD_INLINE Vec<Float, 64> cmpge(
const Vec<Float, 64> &a,
4231 const Vec<Float, 64> &b)
4235 return _mm512_castsi512_ps(
4236 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GE_OS)));
4239static SIMD_INLINE Vec<Double, 64> cmpge(
const Vec<Double, 64> &a,
4240 const Vec<Double, 64> &b)
4244 return _mm512_castsi512_pd(
4245 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_GE_OS)));
4257static SIMD_INLINE Vec<Byte, 64> cmpneq(
const Vec<Byte, 64> &a,
4258 const Vec<Byte, 64> &b)
4260 return _mm512_movm_epi8(_mm512_cmpneq_epu8_mask(a, b));
4263static SIMD_INLINE Vec<SignedByte, 64> cmpneq(
const Vec<SignedByte, 64> &a,
4264 const Vec<SignedByte, 64> &b)
4266 return _mm512_movm_epi8(_mm512_cmpneq_epi8_mask(a, b));
4269static SIMD_INLINE Vec<Word, 64> cmpneq(
const Vec<Word, 64> &a,
4270 const Vec<Word, 64> &b)
4272 return _mm512_movm_epi16(_mm512_cmpneq_epu16_mask(a, b));
4275static SIMD_INLINE Vec<Short, 64> cmpneq(
const Vec<Short, 64> &a,
4276 const Vec<Short, 64> &b)
4278 return _mm512_movm_epi16(_mm512_cmpneq_epi16_mask(a, b));
4284template <
typename T>
4285static SIMD_INLINE Vec<T, 64> cmpneq(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4287 return Vec<T, 64>(cmpneq(a.lo(), b.lo()), cmpneq(a.hi(), b.hi()));
4292static SIMD_INLINE Vec<Int, 64> cmpneq(
const Vec<Int, 64> &a,
4293 const Vec<Int, 64> &b)
4295 return x_mm512_movm_epi32(_mm512_cmpneq_epi32_mask(a, b));
4298static SIMD_INLINE Vec<Long, 64> cmpneq(
const Vec<Long, 64> &a,
4299 const Vec<Long, 64> &b)
4301 return x_mm512_movm_epi64(_mm512_cmpneq_epi64_mask(a, b));
4304static SIMD_INLINE Vec<Float, 64> cmpneq(
const Vec<Float, 64> &a,
4305 const Vec<Float, 64> &b)
4309 return _mm512_castsi512_ps(
4310 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ)));
4313static SIMD_INLINE Vec<Double, 64> cmpneq(
const Vec<Double, 64> &a,
4314 const Vec<Double, 64> &b)
4318 return _mm512_castsi512_pd(
4319 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ)));
4333template <
typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4334static SIMD_INLINE Vec<T, 64> ifelse(const Vec<T, 64> &cond,
4335 const Vec<T, 64> &trueVal,
4336 const Vec<T, 64> &falseVal)
4345 const __mmask64 condReg =
4346 _mm512_movepi8_mask(re
interpret(cond, OutputType<Byte>()));
4348 const __m512i trueReg = (__m512i) re
interpret(trueVal, OutputType<Byte>());
4349 const __m512i falseReg = (__m512i) re
interpret(falseVal, OutputType<Byte>());
4350 const Vec<Byte, 64> res = _mm512_mask_blend_epi8(condReg, falseReg, trueReg);
4352 const Vec<Byte, 64> res = _mm512_or_si512(
4353 _mm512_and_si512(re
interpret(cond, OutputType<Byte>()),
4354 re
interpret(trueVal, OutputType<Byte>())),
4355 _mm512_andnot_si512(re
interpret(cond, OutputType<Byte>()),
4356 re
interpret(falseVal, OutputType<Byte>())));
4358 return re
interpret(res, OutputType<T>());
4362template <
typename T, SIMD_ENABLE_IF(sizeof(T) > 2),
typename =
void>
4363static SIMD_INLINE Vec<T, 64> ifelse(
const Vec<T, 64> &cond,
4364 const Vec<T, 64> &trueVal,
4365 const Vec<T, 64> &falseVal)
4374 const __mmask16 condReg =
4375 _mm512_movepi32_mask(reinterpret(cond, OutputType<Int>()));
4377 const __m512i trueReg = (__m512i) reinterpret(trueVal, OutputType<Int>());
4378 const __m512i falseReg = (__m512i) reinterpret(falseVal, OutputType<Int>());
4379 const Vec<Int, 64> res = _mm512_mask_blend_epi32(condReg, falseReg, trueReg);
4381 const Vec<Int, 64> res = _mm512_or_si512(
4382 _mm512_and_si512(reinterpret(cond, OutputType<Int>()),
4383 reinterpret(trueVal, OutputType<Int>())),
4384 _mm512_andnot_si512(reinterpret(cond, OutputType<Int>()),
4385 reinterpret(falseVal, OutputType<Int>())));
4387 return reinterpret(res, OutputType<T>());
4394template <
typename T>
4395static SIMD_INLINE Vec<T, 64> bit_and(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4398 const Vec<Byte, 64> res = _mm512_and_si512(
4399 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4400 return reinterpret(res, OutputType<T>());
4407template <
typename T>
4408static SIMD_INLINE Vec<T, 64> bit_or(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4411 const Vec<Byte, 64> res = _mm512_or_si512(reinterpret(a, OutputType<Byte>()),
4412 reinterpret(b, OutputType<Byte>()));
4413 return reinterpret(res, OutputType<T>());
4420template <
typename T>
4421static SIMD_INLINE Vec<T, 64> bit_andnot(
const Vec<T, 64> &a,
4422 const Vec<T, 64> &b)
4425 const Vec<Byte, 64> res = _mm512_andnot_si512(
4426 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4427 return reinterpret(res, OutputType<T>());
4434template <
typename T>
4435static SIMD_INLINE Vec<T, 64> bit_xor(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4438 const Vec<Byte, 64> res = _mm512_xor_si512(
4439 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4440 return reinterpret(res, OutputType<T>());
4448template <
typename T>
4449static SIMD_INLINE Vec<T, 64> bit_not(
const Vec<T, 64> &a)
4453 const Vec<Byte, 64> res =
4454 _mm512_xor_si512(reinterpret(a, OutputType<Byte>()), _mm512_set1_epi32(-1));
4455 return reinterpret(res, OutputType<T>());
4464static SIMD_INLINE Vec<Byte, 64> avg(
const Vec<Byte, 64> &a,
4465 const Vec<Byte, 64> &b)
4467 return _mm512_avg_epu8(a, b);
4472static SIMD_INLINE Vec<SignedByte, 64> avg(
const Vec<SignedByte, 64> &a,
4473 const Vec<SignedByte, 64> &b)
4476 const __m512i signbit = _mm512_set1_epi8(int8_t(0x80));
4477 const __m512i a1 = _mm512_xor_si512(a, signbit);
4478 const __m512i b1 = _mm512_xor_si512(b, signbit);
4479 const __m512i m1 = _mm512_avg_epu8(a1, b1);
4480 return _mm512_xor_si512(m1, signbit);
4483static SIMD_INLINE Vec<Word, 64> avg(
const Vec<Word, 64> &a,
4484 const Vec<Word, 64> &b)
4486 return _mm512_avg_epu16(a, b);
4491static SIMD_INLINE Vec<Short, 64> avg(
const Vec<Short, 64> &a,
4492 const Vec<Short, 64> &b)
4495 const __m512i signbit = _mm512_set1_epi16(int16_t(0x8000));
4496 const __m512i a1 = _mm512_xor_si512(a, signbit);
4497 const __m512i b1 = _mm512_xor_si512(b, signbit);
4498 const __m512i m1 = _mm512_avg_epu16(a1, b1);
4499 return _mm512_xor_si512(m1, signbit);
4505template <
typename T>
4506static SIMD_INLINE Vec<T, 64> avg(
const Vec<T, 64> &a,
const Vec<T, 64> &b)
4508 return Vec<T, 64>(avg(a.lo(), b.lo()), avg(a.hi(), b.hi()));
4515static SIMD_INLINE Vec<Int, 64> avg(
const Vec<Int, 64> &a,
4516 const Vec<Int, 64> &b)
4518 const auto halfA = _mm512_srai_epi32(a, 1);
4519 const auto halfB = _mm512_srai_epi32(b, 1);
4520 const auto sum = _mm512_add_epi32(halfA, halfB);
4522 _mm512_and_si512(_mm512_or_si512(a, b), _mm512_set1_epi32(1));
4523 return _mm512_add_epi32(lsb, sum);
4528static SIMD_INLINE Vec<Long, 64> avg(
const Vec<Long, 64> &a,
4529 const Vec<Long, 64> &b)
4531 const auto halfA = _mm512_srai_epi64(a, 1);
4532 const auto halfB = _mm512_srai_epi64(b, 1);
4533 const auto sum = _mm512_add_epi64(halfA, halfB);
4535 _mm512_and_si512(_mm512_or_si512(a, b), _mm512_set1_epi64(1));
4536 return _mm512_add_epi64(lsb, sum);
4540static SIMD_INLINE Vec<Float, 64> avg(
const Vec<Float, 64> &a,
4541 const Vec<Float, 64> &b)
4543 return _mm512_mul_ps(_mm512_add_ps(a, b), _mm512_set1_ps(0.5f));
4547static SIMD_INLINE Vec<Double, 64> avg(
const Vec<Double, 64> &a,
4548 const Vec<Double, 64> &b)
4550 return _mm512_mul_pd(_mm512_add_pd(a, b), _mm512_set1_pd(0.5));
4557template <
typename T>
4558static SIMD_INLINE
bool test_all_zeros(
const Vec<T, 64> &a)
4560 const auto intA = reinterpret(a, OutputType<Int>());
4561 return _mm512_test_epi32_mask(intA, intA) == 0;
4570template <
typename T>
4571static SIMD_INLINE
bool test_all_ones(
const Vec<T, 64> &a)
4573 return test_all_zeros(bit_not(a));
4580template <
typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4581static SIMD_INLINE Vec<T, 64> reverse(const Vec<T, 64> &a)
4584 SIMD_IF_CONSTEXPR (sizeof(T) == 1) {
4585 mask = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4586 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
4587 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
4588 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
4589 55, 56, 57, 58, 59, 60, 61, 62, 63);
4591 mask = _mm512_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
4592 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29,
4593 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40,
4594 43, 42, 45, 44, 47, 46, 49, 48, 51, 50, 53, 52, 55,
4595 54, 57, 56, 59, 58, 61, 60, 63, 62);
4597#ifdef __AVX512VBMI__
4598 return _mm512_permutexvar_epi8(mask, a);
4600 const Vec<T, 64> r = x_mm512_shuffle_epi8(a, mask);
4601 return _mm512_permutexvar_epi64(_mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6), r);
4605static SIMD_INLINE Vec<Int, 64> reverse(const Vec<Int, 64> &a)
4608 _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4609 return _mm512_permutexvar_epi32(mask, a);
4612static SIMD_INLINE Vec<Long, 64> reverse(const Vec<Long, 64> &a)
4614 return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
4618static SIMD_INLINE Vec<Float, 64> reverse(const Vec<Float, 64> &a)
4621 _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4622 return _mm512_permutexvar_ps(mask, a);
4626static SIMD_INLINE Vec<Double, 64> reverse(const Vec<Double, 64> &a)
4628 return _mm512_permutexvar_pd(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
4637static SIMD_INLINE u
int64_t msb2
int(const Vec<Int, 64> &a)
4640 return _mm512_movepi32_mask(a);
4642 const __m512i mask = _mm512_set1_epi32(u
int32_t(0x80000000));
4643 return _mm512_test_epi32_mask(a, mask);
4647static SIMD_INLINE u
int64_t msb2
int(const Vec<Long, 64> &a)
4650 return _mm512_movepi64_mask(a);
4652 const __m512i mask = _mm512_set1_epi64(u
int64_t(0x8000000000000000));
4653 return _mm512_test_epi64_mask(a, mask);
4657static SIMD_INLINE u
int64_t msb2
int(const Vec<Float, 64> &a)
4660 return _mm512_movepi32_mask(_mm512_castps_si512(a));
4662 const __m512i mask = _mm512_set1_epi32(0x80000000);
4663 return _mm512_test_epi32_mask(_mm512_castps_si512(a), mask);
4667static SIMD_INLINE u
int64_t msb2
int(const Vec<Double, 64> &a)
4670 return u
int64_t(_mm512_movepi64_mask(_mm512_castpd_si512(a)));
4673 const __m512i mask = _mm512_set1_epi64(0x8000000000000000);
4675 return _mm512_test_epi64_mask(_mm512_castpd_si512(a), mask);
4681static SIMD_INLINE u
int64_t
interleave_u
int32_with_zeros(u
int32_t input)
4683 u
int64_t word = input;
4684 word = (word ^ (word << 16)) & 0x0000ffff0000ffff;
4685 word = (word ^ (word << 8)) & 0x00ff00ff00ff00ff;
4686 word = (word ^ (word << 4)) & 0x0f0f0f0f0f0f0f0f;
4687 word = (word ^ (word << 2)) & 0x3333333333333333;
4688 word = (word ^ (word << 1)) & 0x5555555555555555;
4692static SIMD_INLINE u
int64_t msb2
int(const Vec<Byte, 64> &a)
4695 return _mm512_movepi8_mask(a);
4697 const u
int64_t part3 = msb2
int(re
interpret(a, OutputType<Int>()));
4698 const u
int64_t part2 = msb2
int(re
interpret(slle<1>(a), OutputType<Int>()));
4699 const u
int64_t part1 = msb2
int(re
interpret(slle<2>(a), OutputType<Int>()));
4700 const u
int64_t part0 = msb2
int(re
interpret(slle<3>(a), OutputType<Int>()));
4703 const u
int64_t part3_with_zeros =
4704 interleave_u
int32_with_zeros(
interleave_u
int32_with_zeros(part3));
4705 const u
int64_t part2_with_zeros =
4706 interleave_u
int32_with_zeros(
interleave_u
int32_with_zeros(part2));
4707 const u
int64_t part1_with_zeros =
4708 interleave_u
int32_with_zeros(
interleave_u
int32_with_zeros(part1));
4709 const u
int64_t part0_with_zeros =
4710 interleave_u
int32_with_zeros(
interleave_u
int32_with_zeros(part0));
4711 return part0_with_zeros | (part1_with_zeros << 1) | (part2_with_zeros << 2) |
4712 (part3_with_zeros << 3);
4716static SIMD_INLINE u
int64_t msb2
int(const Vec<SignedByte, 64> &a)
4718 return msb2
int(re
interpret(a, OutputType<Byte>()));
4721static SIMD_INLINE u
int64_t msb2
int(const Vec<Short, 64> &a)
4724 return _mm512_movepi16_mask(a);
4726 const u
int64_t odd = msb2
int(re
interpret(a, OutputType<Int>()));
4727 const u
int64_t even = msb2
int(re
interpret(slle<1>(a), OutputType<Int>()));
4728 return
interleave_u
int32_with_zeros(even) |
4729 (
interleave_u
int32_with_zeros(odd) << 1);
4733static SIMD_INLINE u
int64_t msb2
int(const Vec<Word, 64> &a)
4735 return msb2
int(re
interpret(a, OutputType<Short>()));
4744static SIMD_INLINE Vec<Byte, 64>
int2msb(const u
int64_t a, OutputType<Byte>,
4748 return _mm512_maskz_set1_epi8(__mmask64(a), (
int8_t) 0x80);
4750 const __m256i shuffleIndeces = _mm256_set_epi64x(
4751 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
4752 const __m256i aVecLo =
4753 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
4754 const __m256i aVecHi =
4755 _mm256_shuffle_epi8(_mm256_set1_epi32(a >> 32), shuffleIndeces);
4756 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
4757 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4758 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4759 const __m256i resultLo = _mm256_cmpeq_epi8(selectedLo, sel);
4760 const __m256i resultHi = _mm256_cmpeq_epi8(selectedHi, sel);
4761 const __m512i result =
4762 _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4763 return _mm512_and_si512(result, _mm512_set1_epi32(0x80808080));
4767static SIMD_INLINE Vec<SignedByte, 64>
int2msb(const u
int64_t a,
4768 OutputType<SignedByte>,
4771 return re
interpret(
int2msb(a, OutputType<Byte>(), Integer<64>()),
4772 OutputType<SignedByte>());
4775static SIMD_INLINE Vec<Short, 64>
int2msb(const u
int64_t a, OutputType<Short>,
4779 return _mm512_maskz_set1_epi16(__mmask32(a), (
int16_t) 0x8000);
4781 const __m256i sel = _mm256_set_epi16(
4782 (
int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
4783 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
4784 const __m256i aVecLo = _mm256_set1_epi16(a);
4785 const __m256i aVecHi = _mm256_set1_epi16(a >> 16);
4786 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4787 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4788 const __m256i resultLo = _mm256_cmpeq_epi16(selectedLo, sel);
4789 const __m256i resultHi = _mm256_cmpeq_epi16(selectedHi, sel);
4790 const __m512i result =
4791 _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4792 return _mm512_and_si512(result, _mm512_set1_epi32(0x80008000));
4796static SIMD_INLINE Vec<Word, 64>
int2msb(const u
int64_t a, OutputType<Word>,
4799 return re
interpret(
int2msb(a, OutputType<Short>(), Integer<64>()),
4800 OutputType<Word>());
4803static SIMD_INLINE Vec<Int, 64>
int2msb(const u
int64_t a, OutputType<Int>,
4806 return _mm512_maskz_set1_epi32(__mmask16(a), 0x80000000);
4809static SIMD_INLINE Vec<Long, 64>
int2msb(const u
int64_t a, OutputType<Long>,
4812 return _mm512_maskz_set1_epi64(__mmask8(a), 0x8000000000000000);
4815static SIMD_INLINE Vec<Float, 64>
int2msb(const u
int64_t a, OutputType<Float>,
4818 return re
interpret(
int2msb(a, OutputType<Int>(), Integer<64>()),
4819 OutputType<Float>());
4822static SIMD_INLINE Vec<Double, 64>
int2msb(const u
int64_t a, OutputType<Double>,
4825 return _mm512_castsi512_pd(
4826 _mm512_maskz_set1_epi64(__mmask8(a), 0x8000000000000000));
4835static SIMD_INLINE Vec<Byte, 64>
int2bits(const u
int64_t a, OutputType<Byte>,
4839 return _mm512_maskz_set1_epi8(__mmask64(a), (
int8_t) 0xff);
4841 const __m256i shuffleIndeces = _mm256_set_epi64x(
4842 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
4843 const __m256i aVecLo =
4844 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
4845 const __m256i aVecHi =
4846 _mm256_shuffle_epi8(_mm256_set1_epi32(a >> 32), shuffleIndeces);
4847 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
4848 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4849 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4850 const __m256i resultLo = _mm256_cmpeq_epi8(selectedLo, sel);
4851 const __m256i resultHi = _mm256_cmpeq_epi8(selectedHi, sel);
4852 return _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4856static SIMD_INLINE Vec<SignedByte, 64>
int2bits(const u
int64_t a,
4857 OutputType<SignedByte>,
4860 return re
interpret(
int2bits(a, OutputType<Byte>(), Integer<64>()),
4861 OutputType<SignedByte>());
4864static SIMD_INLINE Vec<Short, 64>
int2bits(const u
int64_t a, OutputType<Short>,
4868 return _mm512_maskz_set1_epi16(__mmask32(a), (
int16_t) 0xffff);
4870 const __m256i sel = _mm256_set_epi16(
4871 (
int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
4872 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
4873 const __m256i aVecLo = _mm256_set1_epi16(a);
4874 const __m256i aVecHi = _mm256_set1_epi16(a >> 16);
4875 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4876 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4877 const __m256i resultLo = _mm256_cmpeq_epi16(selectedLo, sel);
4878 const __m256i resultHi = _mm256_cmpeq_epi16(selectedHi, sel);
4879 return _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4883static SIMD_INLINE Vec<Word, 64>
int2bits(const u
int64_t a, OutputType<Word>,
4886 return re
interpret(
int2bits(a, OutputType<Short>(), Integer<64>()),
4887 OutputType<Word>());
4890static SIMD_INLINE Vec<Int, 64>
int2bits(const u
int64_t a, OutputType<Int>,
4893 return _mm512_maskz_set1_epi32(__mmask16(a), 0xffffffff);
4896static SIMD_INLINE Vec<Long, 64>
int2bits(const u
int64_t a, OutputType<Long>,
4899 return _mm512_maskz_set1_epi64(__mmask8(a), 0xffffffffffffffff);
4902static SIMD_INLINE Vec<Float, 64>
int2bits(const u
int64_t a, OutputType<Float>,
4905 return re
interpret(
int2bits(a, OutputType<Int>(), Integer<64>()),
4906 OutputType<Float>());
4909static SIMD_INLINE Vec<Double, 64>
int2bits(const u
int64_t a,
4910 OutputType<Double>, Integer<64>)
4912 return _mm512_castsi512_pd(
4913 _mm512_maskz_set1_epi64(__mmask8(a), 0xffffffffffffffff));
4922static SIMD_INLINE Vec<Byte, 64> iota(OutputType<Byte>, Integer<64>)
4924 return _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50,
4925 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
4926 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
4927 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
4928 7, 6, 5, 4, 3, 2, 1, 0);
4931static SIMD_INLINE Vec<SignedByte, 64> iota(OutputType<SignedByte>, Integer<64>)
4933 return _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50,
4934 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
4935 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
4936 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
4937 7, 6, 5, 4, 3, 2, 1, 0);
4940static SIMD_INLINE Vec<Short, 64> iota(OutputType<Short>, Integer<64>)
4942 return _mm512_set_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
4943 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
4947static SIMD_INLINE Vec<Word, 64> iota(OutputType<Word>, Integer<64>)
4949 return _mm512_set_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
4950 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
4954static SIMD_INLINE Vec<Int, 64> iota(OutputType<Int>, Integer<64>)
4956 return _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4959static SIMD_INLINE Vec<Long, 64> iota(OutputType<Long>, Integer<64>)
4961 return _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
4964static SIMD_INLINE Vec<Float, 64> iota(OutputType<Float>, Integer<64>)
4966 return _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
4967 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
4970static SIMD_INLINE Vec<Double, 64> iota(OutputType<Double>, Integer<64>)
4972 return _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
aligned_allocator< Vec< T, SIMD_WIDTH >, SIMD_WIDTH > allocator
Allocator to be used with std::vector.
Definition vec.H:103
static constexpr size_t elems
Number of elements in the vector. Alias for elements.
Definition vec.H:85
static constexpr size_t bytes
Number of bytes in the vector.
Definition vec.H:90
static constexpr size_t elements
Number of elements in the vector.
Definition vec.H:80
void * aligned_malloc(size_t alignment, size_t size)
Aligned memory allocation.
Definition alloc.H:61
void aligned_free(void *ptr)
Aligned memory deallocation.
Definition alloc.H:102
float Float
Single-precision floating point number (32-bit)
Definition types.H:56
int16_t Short
Signed 16-bit integer.
Definition types.H:53
int32_t Int
Signed 32-bit integer.
Definition types.H:54
uint16_t Word
Unsigned 16-bit integer.
Definition types.H:52
int64_t Long
Signed 64-bit integer.
Definition types.H:55
uint8_t Byte
Unsigned 8-bit integer.
Definition types.H:50
double Double
Double-precision floating point number (64-bit)
Definition types.H:57
int8_t SignedByte
Signed 8-bit integer.
Definition types.H:51
Namespace for T-SIMD.
Definition time_measurement.H:161