34#ifndef SIMD_VEC_BASE_IMPL_INTEL_32_H_
35#define SIMD_VEC_BASE_IMPL_INTEL_32_H_
41#include "base_impl_intel16.H"
42#include "intrins_intel.H"
49#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_32_AVAIL_) && \
50 !defined(SIMDVEC_SANDBOX)
// NOTE(review): extraction fragment of the three 32-byte Vec specializations
// (Vec<T,32> over __m256i, Vec<Float,32> over __m256, Vec<Double,32> over
// __m256d). Class heads, braces and some members are missing from this view;
// code bytes left untouched. Each wrapper stores one 256-bit register,
// converts implicitly to/from the raw intrinsic type, can be built from two
// 128-bit halves (hi:lo), and exposes lo()/hi() via cast/extract intrinsics.
 86 __m256i ymm = _mm256_setzero_si256();
 90 static constexpr size_t elements = 32 /
sizeof(T);
 92 static constexpr size_t bytes = 32;
 95 Vec(
const __m256i &x) { ymm = x; }
 96 Vec &operator=(
const __m256i &x)
 101 operator __m256i()
const {
return ymm; }
 103 Vec(
const Vec<T, 16> &lo,
const Vec<T, 16> &hi)
 105 ymm = _mm256_set_m128i(hi, lo);
 107 SIMD_INLINE Vec<T, 16> lo()
const {
return _mm256_castsi256_si128(ymm); }
 108 SIMD_INLINE Vec<T, 16> hi()
const {
return _mm256_extractf128_si256(ymm, 1); }
// --- Vec<Float,32> fragment ---
 127 __m256 ymm = _mm256_setzero_ps();
 133 static constexpr size_t bytes = 32;
 136 Vec(
const __m256 &x) { ymm = x; }
 137 Vec &operator=(
const __m256 &x)
 142 operator __m256()
const {
return ymm; }
 144 Vec(
const Vec<Float, 16> &lo,
const Vec<Float, 16> &hi)
 146 ymm = _mm256_set_m128(hi, lo);
 148 SIMD_INLINE Vec<Float, 16> lo()
const {
return _mm256_castps256_ps128(ymm); }
 149 SIMD_INLINE Vec<Float, 16> hi()
const
 151 return _mm256_extractf128_ps(ymm, 1);
// --- Vec<Double,32> fragment ---
 177 static constexpr size_t bytes = 32;
 180 Vec(
const __m256d &x) { ymm = x; }
 181 Vec &operator=(
const __m256d &x)
 186 operator __m256d()
const {
return ymm; }
 188 Vec(
const Vec<Double, 16> &lo,
const Vec<Double, 16> &hi)
 190 ymm = _mm256_set_m128d(hi, lo);
 192 SIMD_INLINE Vec<Double, 16> lo()
const {
return _mm256_castpd256_pd128(ymm); }
 193 SIMD_INLINE Vec<Double, 16> hi()
const
 195 return _mm256_extractf128_pd(ymm, 1);
// NOTE(review): extraction fragment. Swizzle_32_16<N,I> recursively builds
// N output vectors by permuting 128-bit lanes of the inputs
// (_mm256_permute2f128_*); the <N,N> specialization ends the recursion.
// Then follow lane-crossing byte-shift/align emulation helpers
// (x_mm256_alignr_epi8, x_mm256_srli256_si256, x_mm256_slli256_si256,
// x_mm256_alignr256_epi8), each dispatched on a Range<..> tag per COUNT.
// Assignment left-hand sides and #if/#else branch markers are missing from
// this view — presumably #ifdef __AVX2__ selects between the single-intrinsic
// and the split-128-bit paths; verify against the unmangled file.
251template <
size_t N,
size_t I = 0>
254 template <
typename T>
255 static SIMD_INLINE
void _swizzle_32_16(
const Vec<T, 32> vIn[N],
271 _mm256_permute2f128_si256(vIn[I / 2], vIn[(I + N) / 2],
272 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
273 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
277 static SIMD_INLINE
void _swizzle_32_16(
const Vec<Float, 32> vIn[N],
278 Vec<Float, 32> vOut[N])
281 _mm256_permute2f128_ps(vIn[I / 2], vIn[(I + N) / 2],
282 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
283 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
287 static SIMD_INLINE
void _swizzle_32_16(
const Vec<Double, 32> vIn[N],
288 Vec<Double, 32> vOut[N])
291 _mm256_permute2f128_pd(vIn[I / 2], vIn[(I + N) / 2],
292 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
293 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
// recursion terminator: I == N does nothing
299struct Swizzle_32_16<N, N>
301 template <
typename T>
302 static SIMD_INLINE
void _swizzle_32_16(
const Vec<T, 32>[N], Vec<T, 32>[N])
308template <
size_t N,
typename T>
309static SIMD_INLINE
void swizzle_32_16(
const Vec<T, 32> vIn[N],
312 Swizzle_32_16<N>::_swizzle_32_16(vIn, vOut);
// concatenate h:l and byte-shift right by COUNT (AVX2 native vs. SSE halves)
323template <
size_t COUNT>
324static SIMD_INLINE __m256i x_mm256_alignr_epi8(__m256i h, __m256i l)
327 static_assert(COUNT < 32,
"");
329 return _mm256_alignr_epi8(h, l, COUNT);
333 return _mm256_set_m128i(_mm_alignr_epi8(_mm256_extractf128_si256(h, 1),
334 _mm256_extractf128_si256(l, 1),
336 _mm_alignr_epi8(_mm256_castsi256_si128(h),
337 _mm256_castsi256_si128(l), COUNT));
// whole-register (256-bit) right shift by COUNT bytes, Range-tag dispatched
355template <
size_t COUNT>
356static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<true, 0, 16>)
362template <
size_t COUNT>
363static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<false, 0, 16>)
369 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
378 return x_mm256_alignr_epi8<COUNT>(_0h, a);
382template <
size_t COUNT>
383static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<true, 16, 32>)
389 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
395template <
size_t COUNT>
396static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a,
397 Range<false, 16, 32>)
403 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
408 return _mm256_srli_si256(_0h, COUNT - 16);
410 return _mm256_set_m128i(
411 _mm_srli_si128(_mm256_extractf128_si256(_0h, 1), COUNT - 16),
412 _mm_srli_si128(_mm256_castsi256_si128(_0h), COUNT - 16));
// COUNT >= 32: everything shifted out, result is zero
417template <
size_t,
bool AT_LOW_LIM,
size_t LOW_LIM_INCL,
size_t UP_LIM_EXCL>
418static SIMD_INLINE __m256i
419x_mm256_srli256_si256(__m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
421 return _mm256_setzero_si256();
425template <
size_t COUNT>
426static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a)
428 return x_mm256_srli256_si256<COUNT>(a, SizeRange<COUNT, 16>());
// whole-register (256-bit) left shift by COUNT bytes, same dispatch scheme
444template <
size_t COUNT>
445static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<true, 0, 16>)
451template <
size_t COUNT>
452static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<false, 0, 16>)
458 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
467 return x_mm256_alignr_epi8<16 - COUNT>(a, _l0);
471template <
size_t COUNT>
472static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<true, 16, 32>)
478 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
484template <
size_t COUNT>
485static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a,
486 Range<false, 16, 32>)
492 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
497 return _mm256_slli_si256(_l0, COUNT - 16);
499 return _mm256_set_m128i(
500 _mm_slli_si128(_mm256_extractf128_si256(_l0, 1), COUNT - 16),
501 _mm_slli_si128(_mm256_castsi256_si128(_l0), COUNT - 16));
506template <
size_t,
bool AT_LOW_LIM,
size_t LOW_LIM_INCL,
size_t UP_LIM_EXCL>
507static SIMD_INLINE __m256i
508x_mm256_slli256_si256(__m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
510 return _mm256_setzero_si256();
514template <
size_t COUNT>
515static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a)
517 return x_mm256_slli256_si256<COUNT>(a, SizeRange<COUNT, 16>());
// 512-bit alignr emulation: shift high:low right by COUNT bytes (COUNT<64)
542template <
size_t COUNT>
543static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i, __m256i low,
553template <
size_t COUNT>
554static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
561 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
570 return x_mm256_alignr_epi8<COUNT>(high0_low1, low);
574template <
size_t COUNT>
575static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
582 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
588template <
size_t COUNT>
589static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
590 Range<false, 16, 32>)
596 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
605 return x_mm256_alignr_epi8<COUNT - 16>(high, high0_low1);
609template <
size_t COUNT>
610static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
620template <
size_t COUNT>
621static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
622 Range<false, 32, 48>)
628 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
637 return x_mm256_alignr_epi8<COUNT - 32>(null_high1, high);
641template <
size_t COUNT>
642static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
649 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
655template <
size_t COUNT>
656static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
657 Range<false, 48, 64>)
663 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
672 return x_mm256_alignr_epi8<COUNT - 48>(_mm256_setzero_si256(), null_high1);
// COUNT >= 64: fully shifted out, result is zero
676template <
size_t COUNT,
bool AT_LOW_LIM,
size_t LOW_LIM_INCL,
678static SIMD_INLINE __m256i x_mm256_alignr256_epi8(
679 __m256i, __m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
681 return _mm256_setzero_si256();
685template <
size_t COUNT>
686static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low)
688 return x_mm256_alignr256_epi8<COUNT>(high, low, SizeRange<COUNT, 16>());
695static SIMD_INLINE __m256i x_mm256_duplicate_si128(__m128i a)
697 return _mm256_set_m128i(a, a);
// NOTE(review): extraction fragment — three alternative implementations of a
// 2x2 transpose of the four 64-bit elements (order 0,2,1,3). Presumably
// selected by preprocessor tests (#ifdef __AVX2__ for the single
// permute4x64, an AVX1 permute/blend path, and a split-128-bit fallback);
// the #if/#else lines and some lvalues are missing from this view — verify.
708static SIMD_INLINE __m256i x_mm256_transpose4x64_epi64(__m256i a)
711 return _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
721 in = _mm256_castsi256_pd(a);
726 x1 = _mm256_permute_pd(in, _MM_SHUFFLE(0, 0, 1, 1));
731 x2 = _mm256_permute2f128_pd(x1, x1, _MM_SHUFFLE(0, 0, 1, 1));
737 return _mm256_castpd_si256(_mm256_blend_pd(in, x2, _MM_SHUFFLE(0, 0, 1, 2)));
742 __m128i lo = _mm256_castsi256_si128(a);
743 __m128i hi = _mm256_extractf128_si256(a, 1);
744 __m128i loRes = _mm_unpacklo_epi64(lo, hi);
745 __m128i hiRes = _mm_unpackhi_epi64(lo, hi);
746 return _mm256_set_m128i(hiRes, loRes);
752static SIMD_INLINE __m256 x_mm256_transpose4x64_ps(__m256 a)
754 return _mm256_castsi256_ps(
755 x_mm256_transpose4x64_epi64(_mm256_castps_si256(a)));
758static SIMD_INLINE __m256d x_mm256_transpose4x64_pd(__m256d a)
760 return _mm256_castsi256_pd(
761 x_mm256_transpose4x64_epi64(_mm256_castpd_si256(a)));
768static SIMD_INLINE __m256 x_mm256_unpacklo_2ps(__m256 a, __m256 b)
770 return _mm256_castpd_ps(
771 _mm256_unpacklo_pd(_mm256_castps_pd(a), _mm256_castps_pd(b)));
774static SIMD_INLINE __m256 x_mm256_unpackhi_2ps(__m256 a, __m256 b)
776 return _mm256_castpd_ps(
777 _mm256_unpackhi_pd(_mm256_castps_pd(a), _mm256_castps_pd(b)));
// NOTE(review): two alternative definitions of each generator macro are
// visible — presumably the first is the #ifdef __AVX2__ branch (native
// 256-bit intrinsic) and the second the AVX1 fallback (apply the 128-bit
// SSE intrinsic to both halves); the #if/#else/#endif lines are missing
// from this view. The macros generate x_mm256_<INTRIN> wrappers for the
// integer unpack/shuffle/pack/horizontal-add families below.
786#define SIMDVEC_INTEL_X_INT_BINFCT_32(INTRIN) \
787 static SIMD_INLINE __m256i x_mm256_##INTRIN(__m256i a, __m256i b) \
789 return _mm256_##INTRIN(a, b); \
793#define SIMDVEC_INTEL_X_INT_BINFCT_32(INTRIN) \
794 static SIMD_INLINE __m256i x_mm256_##INTRIN(__m256i a, __m256i b) \
796 return _mm256_set_m128i( \
797 _mm_##INTRIN(_mm256_extractf128_si256(a, 1), \
798 _mm256_extractf128_si256(b, 1)), \
799 _mm_##INTRIN(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))); \
803SIMDVEC_INTEL_X_INT_BINFCT_32(unpacklo_epi8)
804SIMDVEC_INTEL_X_INT_BINFCT_32(unpackhi_epi8)
805SIMDVEC_INTEL_X_INT_BINFCT_32(unpacklo_epi16)
806SIMDVEC_INTEL_X_INT_BINFCT_32(unpackhi_epi16)
807SIMDVEC_INTEL_X_INT_BINFCT_32(shuffle_epi8)
808SIMDVEC_INTEL_X_INT_BINFCT_32(packs_epi16)
809SIMDVEC_INTEL_X_INT_BINFCT_32(packs_epi32)
810SIMDVEC_INTEL_X_INT_BINFCT_32(packus_epi16)
811SIMDVEC_INTEL_X_INT_BINFCT_32(packus_epi32)
812SIMDVEC_INTEL_X_INT_BINFCT_32(hadd_epi16)
813SIMDVEC_INTEL_X_INT_BINFCT_32(hadd_epi32)
814SIMDVEC_INTEL_X_INT_BINFCT_32(hadds_epi16)
815SIMDVEC_INTEL_X_INT_BINFCT_32(hsub_epi16)
816SIMDVEC_INTEL_X_INT_BINFCT_32(hsub_epi32)
817SIMDVEC_INTEL_X_INT_BINFCT_32(hsubs_epi16)
// 32/64-bit unpacks: native with AVX2, otherwise emulated with the ps/pd
// unpack instructions via casts (second definition).
822#define SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(INTRIN, INTSUFFIX, PSPDSUFFIX) \
823 static SIMD_INLINE __m256i x_mm256_##INTRIN##_##INTSUFFIX(__m256i a, \
826 return _mm256_##INTRIN##_##INTSUFFIX(a, b); \
830#define SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(INTRIN, INTSUFFIX, PSPDSUFFIX) \
831 static SIMD_INLINE __m256i x_mm256_##INTRIN##_##INTSUFFIX(__m256i a, \
834 return _mm256_cast##PSPDSUFFIX##_si256( \
835 _mm256_##INTRIN##_##PSPDSUFFIX(_mm256_castsi256##_##PSPDSUFFIX(a), \
836 _mm256_castsi256##_##PSPDSUFFIX(b))); \
841SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpacklo, epi32, ps)
842SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpackhi, epi32, ps)
843SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpacklo, epi64, pd)
844SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpackhi, epi64, pd)
// NOTE(review): extraction fragment — bit-preserving reinterpret overloads
// (int<->int, int<->Float, int<->Double, Float<->Double, identity), then
// saturating conversions (cvts) and setzero. Some parameter lines (the
// OutputType<...> tags) and #if branch markers are missing; code bytes left
// untouched.
861template <
typename Tdst,
typename Tsrc,
862 SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
863 std::is_integral<Tdst>::value &&
864 std::is_integral<Tsrc>::value))>
865static SIMD_INLINE Vec<Tdst, 32> reinterpret(
const Vec<Tsrc, 32> &vec,
871 return Vec<Tdst, 32>(__m256i(vec));
875template <
typename Tdst, SIMD_ENABLE_IF((std::is_
integral<Tdst>::value))>
876static SIMD_INLINE Vec<Tdst, 32> reinterpret(
const Vec<Float, 32> &vec,
879 return _mm256_castps_si256(vec);
883template <
typename Tsrc, SIMD_ENABLE_IF((std::is_
integral<Tsrc>::value))>
884static SIMD_INLINE Vec<Float, 32> reinterpret(
const Vec<Tsrc, 32> &vec,
887 return _mm256_castsi256_ps(vec);
891template <
typename Tdst, SIMD_ENABLE_IF((std::is_
integral<Tdst>::value))>
892static SIMD_INLINE Vec<Tdst, 32> reinterpret(
const Vec<Double, 32> &vec,
895 return _mm256_castpd_si256(vec);
899template <
typename Tsrc, SIMD_ENABLE_IF((std::is_
integral<Tsrc>::value))>
900static SIMD_INLINE Vec<Double, 32> reinterpret(
const Vec<Tsrc, 32> &vec,
903 return _mm256_castsi256_pd(vec);
907static SIMD_INLINE Vec<Double, 32> reinterpret(
const Vec<Float, 32> &vec,
910 return _mm256_castps_pd(vec);
914static SIMD_INLINE Vec<Float, 32> reinterpret(
const Vec<Double, 32> &vec,
917 return _mm256_castpd_ps(vec);
// identity reinterpret (body missing in this view — presumably returns vec)
922static SIMD_INLINE Vec<T, 32> reinterpret(
const Vec<T, 32> &vec, OutputType<T>)
// Float -> Int: clamp at the largest float exactly convertible to int32
// before cvtps (avoids UB/wraparound for out-of-range values).
938static SIMD_INLINE Vec<Int, 32> cvts(
const Vec<Float, 32> &a, OutputType<Int>)
943 __m256 clip = _mm256_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
944 return _mm256_cvtps_epi32(_mm256_min_ps(clip, a));
948static SIMD_INLINE Vec<Float, 32> cvts(
const Vec<Int, 32> &a, OutputType<Float>)
950 return _mm256_cvtepi32_ps(a);
// Double -> Long: scalar round-trip through aligned buffers (no packed
// cvtpd->epi64 below AVX-512); rint + clamp per element.
953static SIMD_INLINE Vec<Long, 32> cvts(
const Vec<Double, 32> &a,
958 Double tmpD[4] SIMD_ATTR_ALIGNED(32);
959 _mm256_store_pd(tmpD, a);
960 Long tmpL[4] SIMD_ATTR_ALIGNED(32);
961 for (
int i = 0; i < 4; ++i) {
963 Long(std::rint(std::min(tmpD[i], MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64)));
965 return _mm256_load_si256((__m256i *) tmpL);
// Long -> Double: magic-number bias trick on 32-bit halves (AVX2 path);
// the split-halves fallback at the end presumably sits in an #else branch.
968static SIMD_INLINE Vec<Double, 32> cvts(
const Vec<Long, 32> &a,
973 __m256i xH = _mm256_srai_epi32(a, 16);
974 xH = _mm256_and_si256(xH, _mm256_set1_epi64x(0xffffffff00000000));
975 xH = _mm256_add_epi64(
976 xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.)));
977 __m256i xL = _mm256_blend_epi16(
978 a, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0x88);
980 _mm256_sub_pd(_mm256_castsi256_pd(xH),
981 _mm256_set1_pd(442726361368656609280.));
982 return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
985 return Vec<Double, 32>(cvts(a.lo(), OutputType<Double>()),
986 cvts(a.hi(), OutputType<Double>()));
// zero vectors for integral / Float / Double element types
994template <
typename T, SIMD_ENABLE_IF(std::is_
integral<T>::value)>
995static SIMD_INLINE Vec<T, 32> setzero(OutputType<T>, Integer<32>)
997 return _mm256_setzero_si256();
1000static SIMD_INLINE Vec<Float, 32> setzero(OutputType<Float>, Integer<32>)
1002 return _mm256_setzero_ps();
1005static SIMD_INLINE Vec<Double, 32> setzero(OutputType<Double>, Integer<32>)
1007 return _mm256_setzero_pd();
1014static SIMD_INLINE Vec<Byte, 32> set1(Byte a, Integer<32>)
1016 return _mm256_set1_epi8(a);
1019static SIMD_INLINE Vec<SignedByte, 32> set1(SignedByte a, Integer<32>)
1021 return _mm256_set1_epi8(a);
1024static SIMD_INLINE Vec<Word, 32> set1(Word a, Integer<32>)
1026 return _mm256_set1_epi16(a);
1029static SIMD_INLINE Vec<Short, 32> set1(Short a, Integer<32>)
1031 return _mm256_set1_epi16(a);
1034static SIMD_INLINE Vec<Int, 32> set1(Int a, Integer<32>)
1036 return _mm256_set1_epi32(a);
1039static SIMD_INLINE Vec<Long, 32> set1(Long a, Integer<32>)
1041 return _mm256_set1_epi64x(a);
1044static SIMD_INLINE Vec<Float, 32> set1(Float a, Integer<32>)
1046 return _mm256_set1_ps(a);
1049static SIMD_INLINE Vec<Double, 32> set1(Double a, Integer<32>)
1051 return _mm256_set1_pd(a);
1058template <
typename T>
1059static SIMD_INLINE Vec<T, 32> load(
const T *
const p, Integer<32>)
1063 SIMD_CHECK_ALIGNMENT(p, 32);
1064 return _mm256_load_si256((__m256i *) p);
1067static SIMD_INLINE Vec<Float, 32> load(
const Float *
const p, Integer<32>)
1071 SIMD_CHECK_ALIGNMENT(p, 32);
1072 return _mm256_load_ps(p);
1075static SIMD_INLINE Vec<Double, 32> load(
const Double *
const p, Integer<32>)
1079 SIMD_CHECK_ALIGNMENT(p, 32);
1080 return _mm256_load_pd(p);
1087template <
typename T>
1088static SIMD_INLINE Vec<T, 32> loadu(
const T *
const p, Integer<32>)
1090 return _mm256_loadu_si256((__m256i *) p);
1093static SIMD_INLINE Vec<Float, 32> loadu(
const Float *
const p, Integer<32>)
1095 return _mm256_loadu_ps(p);
1098static SIMD_INLINE Vec<Double, 32> loadu(
const Double *
const p, Integer<32>)
1100 return _mm256_loadu_pd(p);
1108template <
typename T>
1109static SIMD_INLINE
void store(T *
const p,
const Vec<T, 32> &a)
1113 SIMD_CHECK_ALIGNMENT(p, 32);
1114 _mm256_store_si256((__m256i *) p, a);
1118static SIMD_INLINE
void store(Float *
const p,
const Vec<Float, 32> &a)
1122 SIMD_CHECK_ALIGNMENT(p, 32);
1123 _mm256_store_ps(p, a);
1127static SIMD_INLINE
void store(Double *
const p,
const Vec<Double, 32> &a)
1131 SIMD_CHECK_ALIGNMENT(p, 32);
1132 _mm256_store_pd(p, a);
1140template <
typename T>
1141static SIMD_INLINE
void storeu(T *
const p,
const Vec<T, 32> &a)
1143 _mm256_storeu_si256((__m256i *) p, a);
1147static SIMD_INLINE
void storeu(Float *
const p,
const Vec<Float, 32> &a)
1149 _mm256_storeu_ps(p, a);
1153static SIMD_INLINE
void storeu(Double *
const p,
const Vec<Double, 32> &a)
1155 _mm256_storeu_pd(p, a);
1163template <
typename T>
1164static SIMD_INLINE
void stream_store(T *
const p,
const Vec<T, 32> &a)
1168 SIMD_CHECK_ALIGNMENT(p, 32);
1169 _mm256_stream_si256((__m256i *) p, a);
1173static SIMD_INLINE
void stream_store(Float *
const p,
const Vec<Float, 32> &a)
1177 SIMD_CHECK_ALIGNMENT(p, 32);
1178 _mm256_stream_ps(p, a);
1182static SIMD_INLINE
void stream_store(Double *
const p,
const Vec<Double, 32> &a)
1186 SIMD_CHECK_ALIGNMENT(p, 32);
1187 _mm256_stream_pd(p, a);
// NOTE(review): extraction fragment — extract<COUNT>() returns element COUNT
// of a 32-byte vector. The else-branches of the SIMD_IF_CONSTEXPR range
// checks (presumably returning 0 / a default for out-of-range COUNT) are
// missing from this view; code bytes left untouched. Signed/float variants
// delegate to the unsigned/int variants via reinterpret + bit_cast.
1194template <
size_t COUNT>
1195static SIMD_INLINE
Byte extract(
const Vec<Byte, 32> &a)
1197 SIMD_IF_CONSTEXPR (COUNT < 32) {
1200 return _mm256_extract_epi8(a, COUNT);
1206template <
size_t COUNT>
1207static SIMD_INLINE
SignedByte extract(
const Vec<SignedByte, 32> &a)
1209 return ::simd::internal::bit_cast<SignedByte>(
1210 extract<COUNT>(reinterpret(a, OutputType<Byte>())));
1213template <
size_t COUNT>
1214static SIMD_INLINE
Word extract(
const Vec<Word, 32> &a)
1216 SIMD_IF_CONSTEXPR (COUNT < 16) {
1219 return _mm256_extract_epi16(a, COUNT);
1225template <
size_t COUNT>
1226static SIMD_INLINE
Short extract(
const Vec<Short, 32> &a)
1228 return ::simd::internal::bit_cast<Short>(
1229 extract<COUNT>(reinterpret(a, OutputType<Word>())));
1232template <
size_t COUNT>
1233static SIMD_INLINE
Int extract(
const Vec<Int, 32> &a)
1235 SIMD_IF_CONSTEXPR (COUNT < 8) {
1236 return _mm256_extract_epi32(a, COUNT);
1242template <
size_t COUNT>
1243static SIMD_INLINE
Long extract(
const Vec<Long, 32> &a)
1245 SIMD_IF_CONSTEXPR (COUNT < 4) {
1246 return _mm256_extract_epi64(a, COUNT);
1252template <
size_t COUNT>
1253static SIMD_INLINE
Float extract(
const Vec<Float, 32> &a)
1255 return ::simd::internal::bit_cast<Float>(
1256 extract<COUNT>(reinterpret(a, OutputType<Int>())));
1259template <
size_t COUNT>
1260static SIMD_INLINE
Double extract(
const Vec<Double, 32> &a)
1262 SIMD_IF_CONSTEXPR (COUNT < 4) {
1263 return ::simd::internal::bit_cast<Double>(
1264 _mm256_extract_epi64(_mm256_castpd_si256(a), COUNT));
1276static SIMD_INLINE Vec<Byte, 32> add(
const Vec<Byte, 32> &a,
1277 const Vec<Byte, 32> &b)
1279 return _mm256_add_epi8(a, b);
1282static SIMD_INLINE Vec<SignedByte, 32> add(
const Vec<SignedByte, 32> &a,
1283 const Vec<SignedByte, 32> &b)
1285 return _mm256_add_epi8(a, b);
1288static SIMD_INLINE Vec<Word, 32> add(
const Vec<Word, 32> &a,
1289 const Vec<Word, 32> &b)
1291 return _mm256_add_epi16(a, b);
1294static SIMD_INLINE Vec<Short, 32> add(
const Vec<Short, 32> &a,
1295 const Vec<Short, 32> &b)
1297 return _mm256_add_epi16(a, b);
1300static SIMD_INLINE Vec<Int, 32> add(
const Vec<Int, 32> &a,
1301 const Vec<Int, 32> &b)
1303 return _mm256_add_epi32(a, b);
1306static SIMD_INLINE Vec<Long, 32> add(
const Vec<Long, 32> &a,
1307 const Vec<Long, 32> &b)
1309 return _mm256_add_epi64(a, b);
1315template <
typename T>
1316static SIMD_INLINE Vec<T, 32> add(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1318 return Vec<T, 32>(add(a.lo(), b.lo()), add(a.hi(), b.hi()));
1323static SIMD_INLINE Vec<Float, 32> add(
const Vec<Float, 32> &a,
1324 const Vec<Float, 32> &b)
1326 return _mm256_add_ps(a, b);
1329static SIMD_INLINE Vec<Double, 32> add(
const Vec<Double, 32> &a,
1330 const Vec<Double, 32> &b)
1332 return _mm256_add_pd(a, b);
1341static SIMD_INLINE Vec<Byte, 32> adds(
const Vec<Byte, 32> &a,
1342 const Vec<Byte, 32> &b)
1344 return _mm256_adds_epu8(a, b);
1347static SIMD_INLINE Vec<SignedByte, 32> adds(
const Vec<SignedByte, 32> &a,
1348 const Vec<SignedByte, 32> &b)
1350 return _mm256_adds_epi8(a, b);
1353static SIMD_INLINE Vec<Word, 32> adds(
const Vec<Word, 32> &a,
1354 const Vec<Word, 32> &b)
1356 return _mm256_adds_epu16(a, b);
1359static SIMD_INLINE Vec<Short, 32> adds(
const Vec<Short, 32> &a,
1360 const Vec<Short, 32> &b)
1362 return _mm256_adds_epi16(a, b);
// NOTE(review): extraction fragment — saturating add for Int/Long, which
// have no native AVX2 saturating intrinsic. Technique: compute the wrapped
// sum, detect signed overflow (operands same sign, sum different sign),
// and blend in the saturation value (0x7FFF.. XOR sign of a). Some
// declaration lines (e.g. "__m256i overflow =") are missing from this
// view; code bytes left untouched.
1365static SIMD_INLINE Vec<Int, 32> adds(
const Vec<Int, 32> &a,
1366 const Vec<Int, 32> &b)
1375 __m256i sum = _mm256_add_epi32(a, b);
1376 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1377 __m256i sumHasDiffSign = _mm256_xor_si256(a, sum);
1380 _mm256_srai_epi32(_mm256_andnot_si256(opsHaveDiffSign, sumHasDiffSign), 31);
1384 __m256i saturatedSum =
1385 _mm256_xor_si256(_mm256_srai_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF));
1387 return _mm256_or_si256(_mm256_andnot_si256(overflow, sum),
1388 _mm256_and_si256(overflow, saturatedSum));
// Long variant: no 64-bit arithmetic shift below AVX-512, so the 32-bit
// sign is spread to 64 bits via srai + shuffle of the odd 32-bit lanes.
1391static SIMD_INLINE Vec<Long, 32> adds(
const Vec<Long, 32> &a,
1392 const Vec<Long, 32> &b)
1398 __m256i sum = _mm256_add_epi64(a, b);
1399 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1400 __m256i sumHasDiffSign = _mm256_xor_si256(a, sum);
1402 __m256i overflow32 =
1403 _mm256_srai_epi32(_mm256_andnot_si256(opsHaveDiffSign, sumHasDiffSign), 31);
1405 __m256i overflow = _mm256_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
1409 __m256i saturatedSum = _mm256_xor_si256(
1410 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)),
1411 _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1413 return _mm256_or_si256(_mm256_andnot_si256(overflow, sum),
1414 _mm256_and_si256(overflow, saturatedSum));
1420template <
typename T>
1421static SIMD_INLINE Vec<T, 32> adds(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1423 return Vec<T, 32>(adds(a.lo(), b.lo()), adds(a.hi(), b.hi()));
1429static SIMD_INLINE Vec<Float, 32> adds(
const Vec<Float, 32> &a,
1430 const Vec<Float, 32> &b)
1432 return _mm256_add_ps(a, b);
1436static SIMD_INLINE Vec<Double, 32> adds(
const Vec<Double, 32> &a,
1437 const Vec<Double, 32> &b)
1439 return _mm256_add_pd(a, b);
1448static SIMD_INLINE Vec<Byte, 32> sub(
const Vec<Byte, 32> &a,
1449 const Vec<Byte, 32> &b)
1451 return _mm256_sub_epi8(a, b);
1454static SIMD_INLINE Vec<SignedByte, 32> sub(
const Vec<SignedByte, 32> &a,
1455 const Vec<SignedByte, 32> &b)
1457 return _mm256_sub_epi8(a, b);
1460static SIMD_INLINE Vec<Word, 32> sub(
const Vec<Word, 32> &a,
1461 const Vec<Word, 32> &b)
1463 return _mm256_sub_epi16(a, b);
1466static SIMD_INLINE Vec<Short, 32> sub(
const Vec<Short, 32> &a,
1467 const Vec<Short, 32> &b)
1469 return _mm256_sub_epi16(a, b);
1472static SIMD_INLINE Vec<Int, 32> sub(
const Vec<Int, 32> &a,
1473 const Vec<Int, 32> &b)
1475 return _mm256_sub_epi32(a, b);
1478static SIMD_INLINE Vec<Long, 32> sub(
const Vec<Long, 32> &a,
1479 const Vec<Long, 32> &b)
1481 return _mm256_sub_epi64(a, b);
1487template <
typename T>
1488static SIMD_INLINE Vec<T, 32> sub(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1490 return Vec<T, 32>(sub(a.lo(), b.lo()), sub(a.hi(), b.hi()));
1495static SIMD_INLINE Vec<Float, 32> sub(
const Vec<Float, 32> &a,
1496 const Vec<Float, 32> &b)
1498 return _mm256_sub_ps(a, b);
1501static SIMD_INLINE Vec<Double, 32> sub(
const Vec<Double, 32> &a,
1502 const Vec<Double, 32> &b)
1504 return _mm256_sub_pd(a, b);
1513static SIMD_INLINE Vec<Byte, 32> subs(
const Vec<Byte, 32> &a,
1514 const Vec<Byte, 32> &b)
1516 return _mm256_subs_epu8(a, b);
1519static SIMD_INLINE Vec<SignedByte, 32> subs(
const Vec<SignedByte, 32> &a,
1520 const Vec<SignedByte, 32> &b)
1522 return _mm256_subs_epi8(a, b);
1525static SIMD_INLINE Vec<Word, 32> subs(
const Vec<Word, 32> &a,
1526 const Vec<Word, 32> &b)
1528 return _mm256_subs_epu16(a, b);
1531static SIMD_INLINE Vec<Short, 32> subs(
const Vec<Short, 32> &a,
1532 const Vec<Short, 32> &b)
1534 return _mm256_subs_epi16(a, b);
// NOTE(review): extraction fragment — saturating subtract for Int/Long
// (no native AVX2 intrinsic). Mirror image of the adds code: overflow when
// operands have different signs AND the difference changed sign relative
// to a. Some declaration lines (e.g. "__m256i overflow =") are missing
// from this view; code bytes left untouched.
1537static SIMD_INLINE Vec<Int, 32> subs(
const Vec<Int, 32> &a,
1538 const Vec<Int, 32> &b)
1547 __m256i diff = _mm256_sub_epi32(a, b);
1548 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1549 __m256i diffHasDiffSign = _mm256_xor_si256(a, diff);
1552 _mm256_srai_epi32(_mm256_and_si256(opsHaveDiffSign, diffHasDiffSign), 31);
1556 __m256i saturatedDiff =
1557 _mm256_xor_si256(_mm256_srai_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF));
1559 return _mm256_or_si256(_mm256_andnot_si256(overflow, diff),
1560 _mm256_and_si256(overflow, saturatedDiff));
// Long variant: 64-bit sign spread via 32-bit srai + lane shuffle, as no
// 64-bit arithmetic shift exists below AVX-512.
1563static SIMD_INLINE Vec<Long, 32> subs(
const Vec<Long, 32> &a,
1564 const Vec<Long, 32> &b)
1570 __m256i diff = _mm256_sub_epi64(a, b);
1571 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1572 __m256i diffHasDiffSign = _mm256_xor_si256(a, diff);
1574 __m256i overflow32 =
1575 _mm256_srai_epi32(_mm256_and_si256(opsHaveDiffSign, diffHasDiffSign), 31);
1577 __m256i overflow = _mm256_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
1581 __m256i saturatedDiff = _mm256_xor_si256(
1582 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)),
1583 _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1585 return _mm256_or_si256(_mm256_andnot_si256(overflow, diff),
1586 _mm256_and_si256(overflow, saturatedDiff));
1592template <
typename T>
1593static SIMD_INLINE Vec<T, 32> subs(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1595 return Vec<T, 32>(subs(a.lo(), b.lo()), subs(a.hi(), b.hi()));
1601static SIMD_INLINE Vec<Float, 32> subs(
const Vec<Float, 32> &a,
1602 const Vec<Float, 32> &b)
1604 return _mm256_sub_ps(a, b);
1608static SIMD_INLINE Vec<Double, 32> subs(
const Vec<Double, 32> &a,
1609 const Vec<Double, 32> &b)
1611 return _mm256_sub_pd(a, b);
1620static SIMD_INLINE Vec<SignedByte, 32> neg(
const Vec<SignedByte, 32> &a)
1622 return _mm256_sub_epi8(_mm256_setzero_si256(), a);
1625static SIMD_INLINE Vec<Short, 32> neg(
const Vec<Short, 32> &a)
1627 return _mm256_sub_epi16(_mm256_setzero_si256(), a);
1630static SIMD_INLINE Vec<Int, 32> neg(
const Vec<Int, 32> &a)
1632 return _mm256_sub_epi32(_mm256_setzero_si256(), a);
1635static SIMD_INLINE Vec<Long, 32> neg(
const Vec<Long, 32> &a)
1637 return _mm256_sub_epi64(_mm256_setzero_si256(), a);
1643template <
typename T>
1644static SIMD_INLINE Vec<T, 32> neg(
const Vec<T, 32> &a)
1646 return Vec<T, 32>(neg(a.lo()), neg(a.hi()));
1651static SIMD_INLINE Vec<Float, 32> neg(
const Vec<Float, 32> &a)
1653 return _mm256_sub_ps(_mm256_setzero_ps(), a);
1656static SIMD_INLINE Vec<Double, 32> neg(
const Vec<Double, 32> &a)
1658 return _mm256_sub_pd(_mm256_setzero_pd(), a);
// ---------------------------------------------------------------------------
// min: element-wise minimum for 8/16/32-bit integer lanes.
// NOTE(review): these intrinsics need AVX2; guards reconstruct conditional
// compilation lost in the garbled source — verify against unmangled file.
// ---------------------------------------------------------------------------

#ifdef __AVX2__
static SIMD_INLINE Vec<Byte, 32> min(const Vec<Byte, 32> &a,
                                     const Vec<Byte, 32> &b)
{
  return _mm256_min_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 32> min(const Vec<SignedByte, 32> &a,
                                           const Vec<SignedByte, 32> &b)
{
  return _mm256_min_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 32> min(const Vec<Word, 32> &a,
                                     const Vec<Word, 32> &b)
{
  return _mm256_min_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 32> min(const Vec<Short, 32> &a,
                                      const Vec<Short, 32> &b)
{
  return _mm256_min_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 32> min(const Vec<Int, 32> &a,
                                    const Vec<Int, 32> &b)
{
  return _mm256_min_epi32(a, b);
}
#endif // __AVX2__
// NOTE(review): extraction fragment — 64-bit signed minimum (no native
// intrinsic below AVX-512). Computes a signed greater-than mask from the
// borrow/sign of b - a, then blends. Two alternative mask computations are
// visible (presumably separate #if branches) and an lvalue line is missing;
// code bytes left untouched.
1700static SIMD_INLINE Vec<Long, 32> min(
const Vec<Long, 32> &a,
1701 const Vec<Long, 32> &b)
1704 const __m256i diff = _mm256_sub_epi64(b, a);
1706 const __m256i res = _mm256_xor_si256(
1707 diff, _mm256_and_si256(_mm256_xor_si256(b, a), _mm256_xor_si256(diff, b)));
1710 _mm256_or_si256(_mm256_andnot_si256(a, b),
1711 _mm256_andnot_si256(_mm256_xor_si256(b, a), diff));
1715 const __m256i spread32 = _mm256_srai_epi32(res, 31);
1716 const __m256i gt = _mm256_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1719 return _mm256_blendv_epi8(a, b, gt);
1725template <
typename T>
1726static SIMD_INLINE Vec<T, 32> min(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1728 return Vec<T, 32>(min(a.lo(), b.lo()), min(a.hi(), b.hi()));
1733static SIMD_INLINE Vec<Float, 32> min(
const Vec<Float, 32> &a,
1734 const Vec<Float, 32> &b)
1736 return _mm256_min_ps(a, b);
1739static SIMD_INLINE Vec<Double, 32> min(
const Vec<Double, 32> &a,
1740 const Vec<Double, 32> &b)
1742 return _mm256_min_pd(a, b);
1751static SIMD_INLINE Vec<Byte, 32> max(
const Vec<Byte, 32> &a,
1752 const Vec<Byte, 32> &b)
1754 return _mm256_max_epu8(a, b);
1757static SIMD_INLINE Vec<SignedByte, 32> max(
const Vec<SignedByte, 32> &a,
1758 const Vec<SignedByte, 32> &b)
1760 return _mm256_max_epi8(a, b);
1763static SIMD_INLINE Vec<Word, 32> max(
const Vec<Word, 32> &a,
1764 const Vec<Word, 32> &b)
1766 return _mm256_max_epu16(a, b);
1769static SIMD_INLINE Vec<Short, 32> max(
const Vec<Short, 32> &a,
1770 const Vec<Short, 32> &b)
1772 return _mm256_max_epi16(a, b);
1775static SIMD_INLINE Vec<Int, 32> max(
const Vec<Int, 32> &a,
1776 const Vec<Int, 32> &b)
1778 return _mm256_max_epi32(a, b);
// NOTE(review): extraction fragment — 64-bit signed maximum; identical
// mask computation as min(Long) above, blend arguments swapped. Alternative
// mask computations presumably sit in missing #if branches and an lvalue
// line is absent; code bytes left untouched.
1784static SIMD_INLINE Vec<Long, 32> max(
const Vec<Long, 32> &a,
1785 const Vec<Long, 32> &b)
1788 const __m256i diff = _mm256_sub_epi64(b, a);
1790 const __m256i res = _mm256_xor_si256(
1791 diff, _mm256_and_si256(_mm256_xor_si256(b, a), _mm256_xor_si256(diff, b)));
1794 _mm256_or_si256(_mm256_andnot_si256(a, b),
1795 _mm256_andnot_si256(_mm256_xor_si256(b, a), diff));
1799 const __m256i spread32 = _mm256_srai_epi32(res, 31);
1800 const __m256i gt = _mm256_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1803 return _mm256_blendv_epi8(b, a, gt);
1809template <
typename T>
1810static SIMD_INLINE Vec<T, 32> max(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
1812 return Vec<T, 32>(max(a.lo(), b.lo()), max(a.hi(), b.hi()));
1817static SIMD_INLINE Vec<Float, 32> max(
const Vec<Float, 32> &a,
1818 const Vec<Float, 32> &b)
1820 return _mm256_max_ps(a, b);
1823static SIMD_INLINE Vec<Double, 32> max(
const Vec<Double, 32> &a,
1824 const Vec<Double, 32> &b)
1826 return _mm256_max_pd(a, b);
1836static SIMD_INLINE Vec<Float, 32> mul(
const Vec<Float, 32> &a,
1837 const Vec<Float, 32> &b)
1839 return _mm256_mul_ps(a, b);
1842static SIMD_INLINE Vec<Double, 32> mul(
const Vec<Double, 32> &a,
1843 const Vec<Double, 32> &b)
1845 return _mm256_mul_pd(a, b);
1848static SIMD_INLINE Vec<Float, 32> div(
const Vec<Float, 32> &a,
1849 const Vec<Float, 32> &b)
1851 return _mm256_div_ps(a, b);
1854static SIMD_INLINE Vec<Double, 32> div(
const Vec<Double, 32> &a,
1855 const Vec<Double, 32> &b)
1857 return _mm256_div_pd(a, b);
// NOTE(review): extraction fragment — ceil/floor/round/truncate for integral
// element types are identity operations; the static_asserts restrict the
// templates to integral T. The "return a;" lines are missing from this
// view; code bytes left untouched.
1868template <
typename T>
1869static SIMD_INLINE Vec<T, 32> ceil(
const Vec<T, 32> &a)
1871 static_assert(std::is_integral<T>::value,
"");
1875template <
typename T>
1876static SIMD_INLINE Vec<T, 32> floor(
const Vec<T, 32> &a)
1878 static_assert(std::is_integral<T>::value,
"");
1882template <
typename T>
1883static SIMD_INLINE Vec<T, 32> round(
const Vec<T, 32> &a)
1885 static_assert(std::is_integral<T>::value,
"");
1889template <
typename T>
1890static SIMD_INLINE Vec<T, 32> truncate(
const Vec<T, 32> &a)
1892 static_assert(std::is_integral<T>::value,
"");
1896static SIMD_INLINE Vec<Float, 32> ceil(
const Vec<Float, 32> &a)
1898 return _mm256_ceil_ps(a);
1901static SIMD_INLINE Vec<Double, 32> ceil(
const Vec<Double, 32> &a)
1903 return _mm256_ceil_pd(a);
1906static SIMD_INLINE Vec<Float, 32> floor(
const Vec<Float, 32> &a)
1908 return _mm256_floor_ps(a);
1911static SIMD_INLINE Vec<Double, 32> floor(
const Vec<Double, 32> &a)
1913 return _mm256_floor_pd(a);
1916static SIMD_INLINE Vec<Float, 32> round(
const Vec<Float, 32> &a)
1921 return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1924static SIMD_INLINE Vec<Double, 32> round(
const Vec<Double, 32> &a)
1926 return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1929static SIMD_INLINE Vec<Float, 32> truncate(
const Vec<Float, 32> &a)
1931 return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1934static SIMD_INLINE Vec<Double, 32> truncate(
const Vec<Double, 32> &a)
1936 return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1944static SIMD_INLINE Vec<Float, 32> rcp(
const Vec<Float, 32> &a)
1946 return _mm256_rcp_ps(a);
1949static SIMD_INLINE Vec<Double, 32> rcp(
const Vec<Double, 32> &a)
1952 return Vec<Double, 32>(rcp(a.lo()), rcp(a.hi()));
1956static SIMD_INLINE Vec<Float, 32> rsqrt(
const Vec<Float, 32> &a)
1958 return _mm256_rsqrt_ps(a);
1961static SIMD_INLINE Vec<Double, 32> rsqrt(
const Vec<Double, 32> &a)
1964 return Vec<Double, 32>(rsqrt(a.lo()), rsqrt(a.hi()));
1968static SIMD_INLINE Vec<Float, 32> sqrt(
const Vec<Float, 32> &a)
1970 return _mm256_sqrt_ps(a);
1973static SIMD_INLINE Vec<Double, 32> sqrt(
const Vec<Double, 32> &a)
1975 return _mm256_sqrt_pd(a);
// ---------------------------------------------------------------------------
// abs for integer element types.
// NOTE(review): several functions below show two consecutive return
// statements; these are presumably alternative compile-time paths (e.g. an
// AVX2 intrinsic path vs. a per-128-bit-half fallback) whose #if/#else
// directives were lost in extraction -- confirm against the original source.
// ---------------------------------------------------------------------------
// Unsigned integral T: abs is the identity (body appears truncated here).
1985template <
typename T, SIMD_ENABLE_IF(std::is_
unsigned<T>::value
1986 &&std::is_
integral<T>::value)>
1987static SIMD_INLINE Vec<T, 32> abs(
const Vec<T, 32> &a)
// SignedByte: AVX2 _mm256_abs_epi8, or the 16-byte abs applied per half.
1992static SIMD_INLINE Vec<SignedByte, 32> abs(
const Vec<SignedByte, 32> &a)
1995 return _mm256_abs_epi8(a);
1998 return Vec<SignedByte, 32>(abs(a.lo()), abs(a.hi()));
// Short: AVX2 _mm256_abs_epi16, or per-half fallback.
2002static SIMD_INLINE Vec<Short, 32> abs(
const Vec<Short, 32> &a)
2005 return _mm256_abs_epi16(a);
2008 return Vec<Short, 32>(abs(a.lo()), abs(a.hi()));
// Int: AVX2 _mm256_abs_epi32, or per-half fallback.
2012static SIMD_INLINE Vec<Int, 32> abs(
const Vec<Int, 32> &a)
2015 return _mm256_abs_epi32(a);
2018 return Vec<Int, 32>(abs(a.lo()), abs(a.hi()));
// Long: no 64-bit abs instruction; build a per-element sign mask by
// arithmetic-shifting the high 32 bits and broadcasting them over each
// 64-bit element, then use the (x ^ mask) - mask two's-complement identity.
// Second return is presumably the non-AVX2 per-half fallback.
2022static SIMD_INLINE Vec<Long, 32> abs(
const Vec<Long, 32> &a)
2027 const __m256i signMask =
2028 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
2029 return _mm256_sub_epi64(_mm256_xor_si256(a, signMask), signMask);
2032 return Vec<Long, 32>(abs(a.lo()), abs(a.hi()));
2036static SIMD_INLINE Vec<Float, 32> abs(
const Vec<Float, 32> &a)
2040 return _mm256_andnot_ps(_mm256_set1_ps(-0.0F), a);
2043static SIMD_INLINE Vec<Double, 32> abs(
const Vec<Double, 32> &a)
2048 return _mm256_andnot_pd(_mm256_set1_pd(-0.0), a);
// ===========================================================================
// unpack: interleave elements of a and b.  Part<0> selects the lower halves
// of the inputs, Part<1> the upper halves; Bytes<N> gives the width of the
// interleaved groups.  The x_mm256_transpose4x64_* pre-pass reorders 64-bit
// chunks so that the lane-local AVX unpack instructions yield a full-width
// (non-lane-wise) interleave.
// NOTE(review): the extracted signatures below are missing their trailing
// Part<>/Bytes<> tag parameters (extraction artifact); the intended tags can
// be inferred from the intrinsic each body uses (epi8 -> Bytes<1>,
// epi16 -> Bytes<2>, epi32/_ps -> Bytes<4>, epi64/_2ps/_pd -> Bytes<8>,
// permute2f128 -> Bytes<16>).
// ===========================================================================
// Part<0>, Bytes<1>: interleave bytes from the lower input halves.
2056template <
typename T>
2057static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2060 return x_mm256_unpacklo_epi8(x_mm256_transpose4x64_epi64(a),
2061 x_mm256_transpose4x64_epi64(b));
// Part<0>, Bytes<2>.
2065template <
typename T>
2066static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2069 return x_mm256_unpacklo_epi16(x_mm256_transpose4x64_epi64(a),
2070 x_mm256_transpose4x64_epi64(b));
// Part<0>, Bytes<4>.
2074template <
typename T>
2075static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2078 return x_mm256_unpacklo_epi32(x_mm256_transpose4x64_epi64(a),
2079 x_mm256_transpose4x64_epi64(b));
// Part<0>, Bytes<8>.
2083template <
typename T>
2084static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2087 return x_mm256_unpacklo_epi64(x_mm256_transpose4x64_epi64(a),
2088 x_mm256_transpose4x64_epi64(b));
// Part<0>, Bytes<16>: concatenate the low 128-bit lanes of a and b.
2092template <
typename T>
2093static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2096 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// Float, Part<0>, Bytes<4>.
2100static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2101 const Vec<Float, 32> &b, Part<0>,
2104 return _mm256_unpacklo_ps(x_mm256_transpose4x64_ps(a),
2105 x_mm256_transpose4x64_ps(b));
// Float, Part<0>, Bytes<8> (pairs of floats).
2109static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2110 const Vec<Float, 32> &b, Part<0>,
2113 return x_mm256_unpacklo_2ps(x_mm256_transpose4x64_ps(a),
2114 x_mm256_transpose4x64_ps(b));
// Float, Part<0>, Bytes<16>.
2118static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2119 const Vec<Float, 32> &b, Part<0>,
2122 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// Double, Part<0>, Bytes<8>.
2126static SIMD_INLINE Vec<Double, 32> unpack(
const Vec<Double, 32> &a,
2127 const Vec<Double, 32> &b, Part<0>,
2130 return _mm256_unpacklo_pd(x_mm256_transpose4x64_pd(a),
2131 x_mm256_transpose4x64_pd(b));
// Double, Part<0>, Bytes<16>.
2135static SIMD_INLINE Vec<Double, 32> unpack(
const Vec<Double, 32> &a,
2136 const Vec<Double, 32> &b, Part<0>,
2139 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// --- Part<1> variants: identical schemes on the upper input halves. -------
// Part<1>, Bytes<1>.
2147template <
typename T>
2148static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2151 return x_mm256_unpackhi_epi8(x_mm256_transpose4x64_epi64(a),
2152 x_mm256_transpose4x64_epi64(b));
// Part<1>, Bytes<2>.
2156template <
typename T>
2157static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2160 return x_mm256_unpackhi_epi16(x_mm256_transpose4x64_epi64(a),
2161 x_mm256_transpose4x64_epi64(b));
// Part<1>, Bytes<4>.
2165template <
typename T>
2166static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2169 return x_mm256_unpackhi_epi32(x_mm256_transpose4x64_epi64(a),
2170 x_mm256_transpose4x64_epi64(b));
// Part<1>, Bytes<8>.
2174template <
typename T>
2175static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2178 return x_mm256_unpackhi_epi64(x_mm256_transpose4x64_epi64(a),
2179 x_mm256_transpose4x64_epi64(b));
// Part<1>, Bytes<16>: concatenate the high 128-bit lanes of a and b.
2183template <
typename T>
2184static SIMD_INLINE Vec<T, 32> unpack(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2187 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
// Float, Part<1>, Bytes<4>.
2191static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2192 const Vec<Float, 32> &b, Part<1>,
2195 return _mm256_unpackhi_ps(x_mm256_transpose4x64_ps(a),
2196 x_mm256_transpose4x64_ps(b));
// Float, Part<1>, Bytes<8>.
2200static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2201 const Vec<Float, 32> &b, Part<1>,
2204 return x_mm256_unpackhi_2ps(x_mm256_transpose4x64_ps(a),
2205 x_mm256_transpose4x64_ps(b));
// Float, Part<1>, Bytes<16>.
2209static SIMD_INLINE Vec<Float, 32> unpack(
const Vec<Float, 32> &a,
2210 const Vec<Float, 32> &b, Part<1>,
2213 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
// Double, Part<1>, Bytes<8>.
2217static SIMD_INLINE Vec<Double, 32> unpack(
const Vec<Double, 32> &a,
2218 const Vec<Double, 32> &b, Part<1>,
2221 return _mm256_unpackhi_pd(x_mm256_transpose4x64_pd(a),
2222 x_mm256_transpose4x64_pd(b));
// Double, Part<1>, Bytes<16>.
2226static SIMD_INLINE Vec<Double, 32> unpack(
const Vec<Double, 32> &a,
2227 const Vec<Double, 32> &b, Part<1>,
2230 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
// ===========================================================================
// unpack16: lane-wise interleave -- same dispatch scheme as unpack above,
// but operating independently inside each 128-bit lane, so no 4x64 transpose
// pre-pass is needed (this is exactly what the AVX unpack instructions do).
// NOTE(review): as with unpack, the trailing Part<>/Bytes<> tag parameters
// are missing from the extracted signatures (extraction artifact).
// ===========================================================================
// Part<0>, Bytes<1>.
2240template <
typename T>
2241static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2244 return x_mm256_unpacklo_epi8(a, b);
// Part<0>, Bytes<2>.
2248template <
typename T>
2249static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2252 return x_mm256_unpacklo_epi16(a, b);
// Part<0>, Bytes<4>.
2256template <
typename T>
2257static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2260 return x_mm256_unpacklo_epi32(a, b);
// Part<0>, Bytes<8>.
2264template <
typename T>
2265static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2268 return x_mm256_unpacklo_epi64(a, b);
// Part<0>, Bytes<16>.
2272template <
typename T>
2273static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2276 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// Float, Part<0>, Bytes<4>.
2280static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2281 const Vec<Float, 32> &b, Part<0>,
2284 return _mm256_unpacklo_ps(a, b);
// Float, Part<0>, Bytes<8>.
2288static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2289 const Vec<Float, 32> &b, Part<0>,
2292 return x_mm256_unpacklo_2ps(a, b);
// Float, Part<0>, Bytes<16>.
2296static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2297 const Vec<Float, 32> &b, Part<0>,
2300 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// Double, Part<0>, Bytes<8>.
2304static SIMD_INLINE Vec<Double, 32> unpack16(
const Vec<Double, 32> &a,
2305 const Vec<Double, 32> &b, Part<0>,
2308 return _mm256_unpacklo_pd(a, b);
// Double, Part<0>, Bytes<16>.
2312static SIMD_INLINE Vec<Double, 32> unpack16(
const Vec<Double, 32> &a,
2313 const Vec<Double, 32> &b, Part<0>,
2316 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
// --- Part<1> variants. ----------------------------------------------------
// Part<1>, Bytes<1>.
2324template <
typename T>
2325static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2328 return x_mm256_unpackhi_epi8(a, b);
// Part<1>, Bytes<2>.
2332template <
typename T>
2333static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2336 return x_mm256_unpackhi_epi16(a, b);
// Part<1>, Bytes<4>.
2340template <
typename T>
2341static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2344 return x_mm256_unpackhi_epi32(a, b);
// Part<1>, Bytes<8>.
2348template <
typename T>
2349static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2352 return x_mm256_unpackhi_epi64(a, b);
// Part<1>, Bytes<16>.
2356template <
typename T>
2357static SIMD_INLINE Vec<T, 32> unpack16(
const Vec<T, 32> &a,
const Vec<T, 32> &b,
2360 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
// Float, Part<1>, Bytes<4>.
2364static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2365 const Vec<Float, 32> &b, Part<1>,
2368 return _mm256_unpackhi_ps(a, b);
// Float, Part<1>, Bytes<8>.
2372static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2373 const Vec<Float, 32> &b, Part<1>,
2376 return x_mm256_unpackhi_2ps(a, b);
// Float, Part<1>, Bytes<16>.
2380static SIMD_INLINE Vec<Float, 32> unpack16(
const Vec<Float, 32> &a,
2381 const Vec<Float, 32> &b, Part<1>,
2384 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
// Double, Part<1>, Bytes<8>.
2388static SIMD_INLINE Vec<Double, 32> unpack16(
const Vec<Double, 32> &a,
2389 const Vec<Double, 32> &b, Part<1>,
2392 return _mm256_unpackhi_pd(a, b);
// Double, Part<1>, Bytes<16>.
2396static SIMD_INLINE Vec<Double, 32> unpack16(
const Vec<Double, 32> &a,
2397 const Vec<Double, 32> &b, Part<1>,
2400 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2412template <
size_t LANE_INDEX,
typename T>
2413static SIMD_INLINE Vec<T, 16> extractLane(
const Vec<T, 32> &a)
2415 const auto intA = reinterpret(a, OutputType<Int>());
2416 const Vec<Int, 16> intRes = _mm256_extractf128_si256(intA, LANE_INDEX);
2417 return reinterpret(intRes, OutputType<T>());
2432template <
typename T>
2433static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2434 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<1>)
2436 __m256i at = x_mm256_transpose4x64_epi64(a);
2437 __m256i bt = x_mm256_transpose4x64_epi64(b);
2438 l = x_mm256_unpacklo_epi8(at, bt);
2439 h = x_mm256_unpackhi_epi8(at, bt);
2443template <
typename T>
2444static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2445 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<2>)
2447 __m256i at = x_mm256_transpose4x64_epi64(a);
2448 __m256i bt = x_mm256_transpose4x64_epi64(b);
2449 l = x_mm256_unpacklo_epi16(at, bt);
2450 h = x_mm256_unpackhi_epi16(at, bt);
2454template <
typename T>
2455static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2456 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<4>)
2458 __m256i at = x_mm256_transpose4x64_epi64(a);
2459 __m256i bt = x_mm256_transpose4x64_epi64(b);
2460 l = x_mm256_unpacklo_epi32(at, bt);
2461 h = x_mm256_unpackhi_epi32(at, bt);
2465template <
typename T>
2466static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2467 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<8>)
2469 __m256i at = x_mm256_transpose4x64_epi64(a);
2470 __m256i bt = x_mm256_transpose4x64_epi64(b);
2471 l = x_mm256_unpacklo_epi64(at, bt);
2472 h = x_mm256_unpackhi_epi64(at, bt);
2476template <
typename T>
2477static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2478 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<16>)
2480 l = _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2481 h = _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2485static SIMD_INLINE
void zip(
const Vec<Float, 32> a,
const Vec<Float, 32> b,
2486 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<4>)
2488 __m256 at = x_mm256_transpose4x64_ps(a);
2489 __m256 bt = x_mm256_transpose4x64_ps(b);
2490 l = _mm256_unpacklo_ps(at, bt);
2491 h = _mm256_unpackhi_ps(at, bt);
2495static SIMD_INLINE
void zip(
const Vec<Float, 32> a,
const Vec<Float, 32> b,
2496 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<8>)
2498 __m256 at = x_mm256_transpose4x64_ps(a);
2499 __m256 bt = x_mm256_transpose4x64_ps(b);
2500 l = x_mm256_unpacklo_2ps(at, bt);
2501 h = x_mm256_unpackhi_2ps(at, bt);
2505static SIMD_INLINE
void zip(
const Vec<Float, 32> a,
const Vec<Float, 32> b,
2506 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<16>)
2508 l = _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2509 h = _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2513static SIMD_INLINE
void zip(
const Vec<Double, 32> a,
const Vec<Double, 32> b,
2514 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<8>)
2516 __m256d at = x_mm256_transpose4x64_pd(a);
2517 __m256d bt = x_mm256_transpose4x64_pd(b);
2518 l = _mm256_unpacklo_pd(at, bt);
2519 h = _mm256_unpackhi_pd(at, bt);
2523static SIMD_INLINE
void zip(
const Vec<Double, 32> a,
const Vec<Double, 32> b,
2524 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<16>)
2526 l = _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2527 h = _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2535template <
size_t NUM_ELEMS,
typename T>
2536static SIMD_INLINE
void zip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2537 Vec<T, 32> &l, Vec<T, 32> &h)
2539 return zip(a, b, l, h, Bytes<NUM_ELEMS *
sizeof(T)>());
2549template <
size_t NUM_ELEMS,
typename T>
2550static SIMD_INLINE
void zip16(
const Vec<T, 32> a,
const Vec<T, 32> b,
2551 Vec<T, 32> &l, Vec<T, 32> &h)
2553 l = unpack16(a, b, Part<0>(), Bytes<NUM_ELEMS *
sizeof(T)>());
2554 h = unpack16(a, b, Part<1>(), Bytes<NUM_ELEMS *
sizeof(T)>());
// ===========================================================================
// unzip: inverse of zip -- de-interleave a and b into even elements (l) and
// odd elements (h).  Strategy: gather even/odd elements within each lane
// (byte shuffle or 32-bit shuffle), move the partial results into position
// with the 4x64 transpose, then combine the two inputs' halves with 128-bit
// lane permutes.
// ===========================================================================
// Bytes<1>: byte-granular de-interleave via pshufb mask (even bytes to the
// low 8 bytes of each lane, odd bytes to the high 8 bytes).
2568template <
typename T>
2569static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2570 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<1>)
2573 const __m256i mask =
2574 _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15,
2575 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
2576 const __m256i atmp =
2577 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(a, mask));
2578 const __m256i btmp =
2579 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(b, mask));
2580 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2581 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Bytes<2>: same scheme with a 16-bit-group shuffle mask.
2585template <
typename T>
2586static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2587 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<2>)
2590 const __m256i mask =
2591 _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15,
2592 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
2593 const __m256i atmp =
2594 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(a, mask));
2595 const __m256i btmp =
2596 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(b, mask));
2597 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2598 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Bytes<4>: 32-bit de-interleave.  NOTE(review): two alternative
// aShuffled/bShuffled definitions appear back to back below -- presumably an
// AVX2 _mm256_shuffle_epi32 path vs. an AVX-only _mm256_shuffle_ps path
// whose #if/#else directives were lost in extraction; confirm.
2602template <
typename T>
2603static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2604 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<4>)
2607 const __m256i aShuffled = _mm256_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2608 const __m256i bShuffled = _mm256_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2610 const __m256i aShuffled = _mm256_castps_si256(_mm256_shuffle_ps(
2611 _mm256_castsi256_ps(a), _mm256_castsi256_ps(a), _MM_SHUFFLE(3, 1, 2, 0)));
2612 const __m256i bShuffled = _mm256_castps_si256(_mm256_shuffle_ps(
2613 _mm256_castsi256_ps(b), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 2, 0)));
2615 const __m256i atmp = x_mm256_transpose4x64_epi64(aShuffled);
2616 const __m256i btmp = x_mm256_transpose4x64_epi64(bShuffled);
2617 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2618 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Bytes<8>: 64-bit groups need no intra-lane shuffle, only the transpose
// and lane combination.
2622template <
typename T>
2623static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2624 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<8>)
2626 const __m256i atmp = x_mm256_transpose4x64_epi64(a);
2627 const __m256i btmp = x_mm256_transpose4x64_epi64(b);
2628 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2629 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Bytes<16>: for whole lanes, unzip degenerates to unpack.
2633template <
typename T>
2634static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2635 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<16>)
2637 l = unpack(a, b, Part<0>(), Bytes<16>());
2638 h = unpack(a, b, Part<1>(), Bytes<16>());
// Float, Bytes<4>.  NOTE(review): the declarations of the two temporaries
// (presumably "const __m256 atmp =" / "const __m256 btmp =") are missing
// from the extracted text below -- confirm against the original source.
2642static SIMD_INLINE
void unzip(
const Vec<Float, 32> a,
const Vec<Float, 32> b,
2643 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<4>)
2646 x_mm256_transpose4x64_ps(_mm256_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 0)));
2648 x_mm256_transpose4x64_ps(_mm256_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 2, 0)));
2649 l = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2650 h = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Float, Bytes<8>.
2654static SIMD_INLINE
void unzip(
const Vec<Float, 32> a,
const Vec<Float, 32> b,
2655 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<8>)
2657 const __m256 atmp = x_mm256_transpose4x64_ps(a);
2658 const __m256 btmp = x_mm256_transpose4x64_ps(b);
2659 l = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2660 h = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
// Double, Bytes<8>.
2664static SIMD_INLINE
void unzip(
const Vec<Double, 32> a,
const Vec<Double, 32> b,
2665 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<8>)
2667 const __m256d atmp = x_mm256_transpose4x64_pd(a);
2668 const __m256d btmp = x_mm256_transpose4x64_pd(b);
2669 l = _mm256_permute2f128_pd(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2670 h = _mm256_permute2f128_pd(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2678template <
size_t NUM_ELEMS,
typename T>
2679static SIMD_INLINE
void unzip(
const Vec<T, 32> a,
const Vec<T, 32> b,
2680 Vec<T, 32> &l, Vec<T, 32> &h)
2682 return unzip(a, b, l, h, Bytes<NUM_ELEMS *
sizeof(T)>());
// ===========================================================================
// packs: saturating pack of two input vectors into one vector of a narrower
// (or same-width, different-type) element type.  The x_mm256_transpose4x64
// post-pass fixes up the lane-wise element order of the AVX pack intrinsics.
// NOTE(review): several functions below contain two alternative bodies back
// to back -- presumably AVX2 vs. fallback paths whose #if/#else directives
// were lost in extraction; confirm against the original source file.
// ===========================================================================
// Short -> SignedByte, signed saturation.
2691static SIMD_INLINE Vec<SignedByte, 32> packs(
const Vec<Short, 32> &a,
2692 const Vec<Short, 32> &b,
2693 OutputType<SignedByte>)
2695 return x_mm256_transpose4x64_epi64(x_mm256_packs_epi16(a, b));
// Int -> Short, signed saturation (the OutputType<Short> tag parameter
// appears to be missing from the extracted signature).
2698static SIMD_INLINE Vec<Short, 32> packs(
const Vec<Int, 32> &a,
2699 const Vec<Int, 32> &b,
2702 return x_mm256_transpose4x64_epi64(x_mm256_packs_epi32(a, b));
// Float -> Short: saturating convert to Int first, then pack.
2705static SIMD_INLINE Vec<Short, 32> packs(
const Vec<Float, 32> &a,
2706 const Vec<Float, 32> &b,
2709 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2710 OutputType<Short>());
// Long -> Float: via saturating Long -> Double conversion and cvtpd_ps.
2713static SIMD_INLINE Vec<Float, 32> packs(
const Vec<Long, 32> &a,
2714 const Vec<Long, 32> &b,
2718 return _mm256_set_m128(_mm256_cvtpd_ps(cvts(b, OutputType<Double>())),
2719 _mm256_cvtpd_ps(cvts(a, OutputType<Double>())));
// Long -> Int: first body clamps each 64-bit element into [INT_MIN, INT_MAX]
// with compare+blend, then picks the low 32 bits of each element via
// shuffle_ps; second body is a scalar store/clamp/load fallback.
// NOTE(review): in the scalar loop the assignment target (presumably
// "output[i] =") is missing from the extracted text -- confirm.
2722static SIMD_INLINE Vec<Int, 32> packs(
const Vec<Long, 32> &a,
2723 const Vec<Long, 32> &b, OutputType<Int>)
2728 const auto maxClip = _mm256_set1_epi64x(0x000000007fffffff);
2729 const auto minClip = _mm256_set1_epi64x(0xffffffff80000000);
2730 const auto aSaturatedMin =
2731 _mm256_blendv_epi8(a, minClip, _mm256_cmpgt_epi64(minClip, a));
2732 const auto aSaturated =
2733 _mm256_blendv_epi8(aSaturatedMin, maxClip, _mm256_cmpgt_epi64(a, maxClip));
2734 const auto bSaturatedMin =
2735 _mm256_blendv_epi8(b, minClip, _mm256_cmpgt_epi64(minClip, b));
2736 const auto bSaturated =
2737 _mm256_blendv_epi8(bSaturatedMin, maxClip, _mm256_cmpgt_epi64(b, maxClip));
2738 return x_mm256_transpose4x64_epi64(_mm256_castps_si256(_mm256_shuffle_ps(
2739 _mm256_castsi256_ps(aSaturated), _mm256_castsi256_ps(bSaturated),
2740 _MM_SHUFFLE(2, 0, 2, 0))));
2747 Long input[8] SIMD_ATTR_ALIGNED(32);
2748 _mm256_store_si256((__m256i *) input, a);
2749 _mm256_store_si256((__m256i *) (input + 4), b);
2750 Int output[8] SIMD_ATTR_ALIGNED(32);
2751 for (
int i = 0; i < 8; ++i) {
2753 (
Int) std::min(std::max(input[i], (Long) std::numeric_limits<Int>::min()),
2754 (Long) std::numeric_limits<Int>::max());
2756 return _mm256_load_si256((__m256i *) output);
// Double -> Float.
2760static SIMD_INLINE Vec<Float, 32> packs(
const Vec<Double, 32> &a,
2761 const Vec<Double, 32> &b,
2764 return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
// Double -> Int: clamp from above before cvtpd_epi32 (values above INT_MAX
// would otherwise convert to the 0x80000000 "indefinite" value).
2767static SIMD_INLINE Vec<Int, 32> packs(
const Vec<Double, 32> &a,
2768 const Vec<Double, 32> &b, OutputType<Int>)
2770 const __m256d clip = _mm256_set1_pd(std::numeric_limits<Int>::max());
2771 return _mm256_set_m128i(_mm256_cvtpd_epi32(_mm256_min_pd(clip, b)),
2772 _mm256_cvtpd_epi32(_mm256_min_pd(clip, a)));
// Word -> Byte, unsigned saturation: min-clamp to 0xff then packus; second
// body is presumably the per-half fallback path.
2777static SIMD_INLINE Vec<Byte, 32> packs(
const Vec<Word, 32> &a,
2778 const Vec<Word, 32> &b, OutputType<Byte>)
2783 return x_mm256_transpose4x64_epi64(
2784 _mm256_packus_epi16(_mm256_min_epu16(a, _mm256_set1_epi16(0xff)),
2785 _mm256_min_epu16(b, _mm256_set1_epi16(0xff))));
2787 return x_mm256_transpose4x64_epi64(
2788 Vec<Byte, 32>(packs(a.lo(), b.lo(), OutputType<Byte>()),
2789 packs(a.hi(), b.hi(), OutputType<Byte>())));
// Short -> Byte, unsigned saturation.
2796static SIMD_INLINE Vec<Byte, 32> packs(
const Vec<Short, 32> &a,
2797 const Vec<Short, 32> &b,
2800 return x_mm256_transpose4x64_epi64(x_mm256_packus_epi16(a, b));
// Int -> Word, unsigned saturation.
2804static SIMD_INLINE Vec<Word, 32> packs(
const Vec<Int, 32> &a,
2805 const Vec<Int, 32> &b, OutputType<Word>)
2807 return x_mm256_transpose4x64_epi64(x_mm256_packus_epi32(a, b));
// Float -> Word: saturating convert to Int first, then pack.
2810static SIMD_INLINE Vec<Word, 32> packs(
const Vec<Float, 32> &a,
2811 const Vec<Float, 32> &b,
2814 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2815 OutputType<Word>());
// Word -> SignedByte: min-clamp to 0x7f then signed pack; second body is
// presumably the per-half fallback path.
2819static SIMD_INLINE Vec<SignedByte, 32> packs(
const Vec<Word, 32> &a,
2820 const Vec<Word, 32> &b,
2821 OutputType<SignedByte>)
2826 return x_mm256_transpose4x64_epi64(
2827 _mm256_packs_epi16(_mm256_min_epu16(a, _mm256_set1_epi16(0x7f)),
2828 _mm256_min_epu16(b, _mm256_set1_epi16(0x7f))));
2830 return x_mm256_transpose4x64_epi64(
2831 Vec<SignedByte, 32>(packs(a.lo(), b.lo(), OutputType<SignedByte>()),
2832 packs(a.hi(), b.hi(), OutputType<SignedByte>())));
// ===========================================================================
// extend, same-size and 2x-widening overloads.  vOut is an array of output
// vectors: one vector for same-size conversion, two for a doubling of the
// element size.  Signed<->unsigned same-size conversions saturate (clamp to
// the destination's representable range).
// NOTE(review): the generic T -> T overload below appears to be missing its
// body (presumably "vOut[0] = vIn;") -- extraction artifact; confirm.
// ===========================================================================
// T -> T: plain copy.
2851template <
typename T>
2852static SIMD_INLINE
void extend(
const Vec<T, 32> &vIn, Vec<T, 32> vOut[1])
// SignedByte -> Byte: clamp negatives to zero.
2859static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
2860 Vec<Byte, 32> vOut[1])
2862 vOut[0] = max(vIn, Vec<SignedByte, 32>(_mm256_setzero_si256()));
// Byte -> SignedByte: clamp to 0x7f.
2865static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn,
2866 Vec<SignedByte, 32> vOut[1])
2868 vOut[0] = min(vIn, Vec<Byte, 32>(_mm256_set1_epi8(0x7f)));
// Short -> Word: clamp negatives to zero.
2871static SIMD_INLINE
void extend(
const Vec<Short, 32> &vIn, Vec<Word, 32> vOut[1])
2873 vOut[0] = max(vIn, Vec<Short, 32>(_mm256_setzero_si256()));
// Word -> Short: clamp to 0x7fff.
2876static SIMD_INLINE
void extend(
const Vec<Word, 32> &vIn, Vec<Short, 32> vOut[1])
2878 vOut[0] = min(vIn, Vec<Word, 32>(_mm256_set1_epi16(0x7fff)));
// --- 2x widening, signed sources: sign-extend each 128-bit half. ----------
// SignedByte -> Short.
2889static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
2890 Vec<Short, 32> vOut[2])
2892 vOut[0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn));
2893 vOut[1] = _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn, 1));
// Short -> Int.
2896static SIMD_INLINE
void extend(
const Vec<Short, 32> &vIn, Vec<Int, 32> vOut[2])
2898 vOut[0] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vIn));
2899 vOut[1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(vIn, 1));
// Short -> Float: widen to Int, then convert.  NOTE(review): the
// "vOut[0] =" / "vOut[1] =" targets appear to be missing (extraction).
2902static SIMD_INLINE
void extend(
const Vec<Short, 32> &vIn,
2903 Vec<Float, 32> vOut[2])
2906 _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(vIn)));
2908 _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extractf128_si256(vIn, 1)));
// Int -> Long.
2911static SIMD_INLINE
void extend(
const Vec<Int, 32> &vIn, Vec<Long, 32> vOut[2])
2913 vOut[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(vIn));
2914 vOut[1] = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(vIn, 1));
// Int -> Double.
2917static SIMD_INLINE
void extend(
const Vec<Int, 32> &vIn, Vec<Double, 32> vOut[2])
2919 vOut[0] = _mm256_cvtepi32_pd(_mm256_castsi256_si128(vIn));
2920 vOut[1] = _mm256_cvtepi32_pd(_mm256_extractf128_si256(vIn, 1));
// Float -> Long: clamp from above so the float value stays convertible,
// widen to double per half, then saturating convert to Long.
// NOTE(review): "vOut[0] =" appears to be missing before line 2929.
2923static SIMD_INLINE
void extend(
const Vec<Float, 32> &vIn, Vec<Long, 32> vOut[2])
2926 const auto clipped =
2927 _mm256_min_ps(vIn, _mm256_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64));
2929 cvts(_mm256_cvtps_pd(_mm256_castps256_ps128(clipped)), OutputType<Long>());
2930 vOut[1] = cvts(_mm256_cvtps_pd(_mm256_extractf128_ps(clipped, 1)),
2931 OutputType<Long>());
// Float -> Double.
2934static SIMD_INLINE
void extend(
const Vec<Float, 32> &vIn,
2935 Vec<Double, 32> vOut[2])
2937 vOut[0] = _mm256_cvtps_pd(_mm256_castps256_ps128(vIn));
2938 vOut[1] = _mm256_cvtps_pd(_mm256_extractf128_ps(vIn, 1));
// --- 2x widening, unsigned sources: zero-extend. --------------------------
// Byte -> Word: zero-extend by interleaving with a zero vector.
2943static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn, Vec<Word, 32> vOut[2])
2946 Vec<Byte, 32> zero = setzero(OutputType<Byte>(), Integer<32>());
2949 vOut[0] = unpack(vIn, zero, Part<0>(), Bytes<1>());
2950 vOut[1] = unpack(vIn, zero, Part<1>(), Bytes<1>());
// Byte -> Short.
2955static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn, Vec<Short, 32> vOut[2])
2957 vOut[0] = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn));
2958 vOut[1] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn, 1));
// Word -> Int.
2961static SIMD_INLINE
void extend(
const Vec<Word, 32> &vIn, Vec<Int, 32> vOut[2])
2963 vOut[0] = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(vIn));
2964 vOut[1] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(vIn, 1));
// Word -> Float.  NOTE(review): "vOut[0] =" / "vOut[1] =" targets appear to
// be missing (extraction).
2967static SIMD_INLINE
void extend(
const Vec<Word, 32> &vIn, Vec<Float, 32> vOut[2])
2970 _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(vIn)));
2972 _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extractf128_si256(vIn, 1)));
// SignedByte -> Word: clamp negatives to zero, then zero-extend.
2977static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
2978 Vec<Word, 32> vOut[2])
2981 const Vec<SignedByte, 32> saturated =
2982 _mm256_max_epi8(vIn, _mm256_setzero_si256());
2983 const Vec<SignedByte, 32> zero = _mm256_setzero_si256();
2984 vOut[0] = unpack(saturated, zero, Part<0>(), Bytes<1>());
2985 vOut[1] = unpack(saturated, zero, Part<1>(), Bytes<1>());
// ===========================================================================
// extend, 4x and 8x widening overloads, plus the generic delegating
// template.  The 4x/8x versions widen in stages through Vec<Int, 32>.
// NOTE(review): in the staged versions below, the call that fills vTmp
// (presumably "extend(vIn, vTmp);") is missing from the extracted text --
// extraction artifact; confirm against the original source.
// ===========================================================================
// SignedByte -> Int (4x): sign-extend 8 bytes at a time from each half.
2994static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
2995 Vec<Int, 32> vOut[4])
2997 __m128i vInLo128 = _mm256_castsi256_si128(vIn);
2998 vOut[0] = _mm256_cvtepi8_epi32(vInLo128);
2999 vOut[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(vInLo128, 8));
3000 __m128i vInHi128 = _mm256_extractf128_si256(vIn, 1);
3001 vOut[2] = _mm256_cvtepi8_epi32(vInHi128);
3002 vOut[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(vInHi128, 8));
// SignedByte -> Float (4x): via Int.
3005static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
3006 Vec<Float, 32> vOut[4])
3008 Vec<Int, 32> vTmp[4];
3010 for (
size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
// Short -> Long (4x): via Int.
3013static SIMD_INLINE
void extend(
const Vec<Short, 32> &vIn, Vec<Long, 32> vOut[4])
3015 Vec<Int, 32> vTmp[2];
3017 extend(vTmp[0], vOut);
3018 extend(vTmp[1], vOut + 2);
// Short -> Double (4x): via Int.
3021static SIMD_INLINE
void extend(
const Vec<Short, 32> &vIn,
3022 Vec<Double, 32> vOut[4])
3024 Vec<Int, 32> vTmp[2];
3026 extend(vTmp[0], vOut);
3027 extend(vTmp[1], vOut + 2);
// Byte -> Int (4x): zero-extend 8 bytes at a time from each half.
3032static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn, Vec<Int, 32> vOut[4])
3034 __m128i vInLo128 = _mm256_castsi256_si128(vIn);
3035 vOut[0] = _mm256_cvtepu8_epi32(vInLo128);
3036 vOut[1] = _mm256_cvtepu8_epi32(_mm_srli_si128(vInLo128, 8));
3037 __m128i vInHi128 = _mm256_extractf128_si256(vIn, 1);
3038 vOut[2] = _mm256_cvtepu8_epi32(vInHi128);
3039 vOut[3] = _mm256_cvtepu8_epi32(_mm_srli_si128(vInHi128, 8));
// Byte -> Float (4x): via Int.
3042static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn, Vec<Float, 32> vOut[4])
3044 Vec<Int, 32> vTmp[4];
3046 for (
size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
// Word -> Long (4x): via Int.
3049static SIMD_INLINE
void extend(
const Vec<Word, 32> &vIn, Vec<Long, 32> vOut[4])
3051 Vec<Int, 32> vTmp[2];
3053 extend(vTmp[0], vOut);
3054 extend(vTmp[1], vOut + 2);
// Word -> Double (4x): via Int.
3057static SIMD_INLINE
void extend(
const Vec<Word, 32> &vIn,
3058 Vec<Double, 32> vOut[4])
3060 Vec<Int, 32> vTmp[2];
3062 extend(vTmp[0], vOut);
3063 extend(vTmp[1], vOut + 2);
// --- 8x widening: two staged 2x extensions through Vec<Int, 32>. ----------
// SignedByte -> Long (8x).
3072static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
3073 Vec<Long, 32> vOut[8])
3075 Vec<Int, 32> vTmp[4];
3077 extend(vTmp[0], vOut);
3078 extend(vTmp[1], vOut + 2);
3079 extend(vTmp[2], vOut + 4);
3080 extend(vTmp[3], vOut + 6);
// SignedByte -> Double (8x).
3083static SIMD_INLINE
void extend(
const Vec<SignedByte, 32> &vIn,
3084 Vec<Double, 32> vOut[8])
3086 Vec<Int, 32> vTmp[4];
3088 extend(vTmp[0], vOut);
3089 extend(vTmp[1], vOut + 2);
3090 extend(vTmp[2], vOut + 4);
3091 extend(vTmp[3], vOut + 6);
// Byte -> Long (8x).
3096static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn, Vec<Long, 32> vOut[8])
3098 Vec<Int, 32> vTmp[4];
3100 extend(vTmp[0], vOut);
3101 extend(vTmp[1], vOut + 2);
3102 extend(vTmp[2], vOut + 4);
3103 extend(vTmp[3], vOut + 6);
// Byte -> Double (8x).
3106static SIMD_INLINE
void extend(
const Vec<Byte, 32> &vIn,
3107 Vec<Double, 32> vOut[8])
3109 Vec<Int, 32> vTmp[4];
3111 extend(vTmp[0], vOut);
3112 extend(vTmp[1], vOut + 2);
3113 extend(vTmp[2], vOut + 4);
3114 extend(vTmp[3], vOut + 6);
// Generic widening extend (sizeof(Tout) > sizeof(Tin)): delegate to the
// 16-byte implementations per 128-bit half and re-pair the 16-byte results
// into 32-byte output vectors in order.
3124template <
typename Tout,
typename Tin,
3125 SIMD_ENABLE_IF(
sizeof(Tout) >
sizeof(Tin))>
3126static SIMD_INLINE
void extend(
const Vec<Tin, 32> &vIn,
3127 Vec<Tout, 32> vOut[
sizeof(Tout) /
sizeof(Tin)])
3129 const size_t nOut =
sizeof(Tout) /
sizeof(Tin), nOutHalf = nOut / 2;
3130 Vec<Tout, 16> vOutLo16[nOut], vOutHi16[nOut];
3131 extend(vIn.lo(), vOutLo16);
3132 extend(vIn.hi(), vOutHi16);
3133 for (
size_t i = 0; i < nOutHalf; i++) {
3134 vOut[i] = Vec<Tout, 32>(vOutLo16[2 * i], vOutLo16[2 * i + 1]);
3135 vOut[i + nOutHalf] = Vec<Tout, 32>(vOutHi16[2 * i], vOutHi16[2 * i + 1]);
3145template <
typename Tout,
typename Tin,
3146 SIMD_ENABLE_IF(
sizeof(Tin) ==
sizeof(Tout)),
3147 SIMD_ENABLE_IF(std::is_floating_point<Tin>::value !=
3148 std::is_floating_point<Tout>::value)>
3149static SIMD_INLINE
void extend(
const Vec<Tin, 32> &vIn, Vec<Tout, 32> vOut[1])
3151 vOut[0] = cvts(vIn, OutputType<Tout>());
// ===========================================================================
// Compile-time shifts: srai (arithmetic right), srli (logical right),
// slli (logical left), shift count COUNT as a template parameter.
// NOTE(review): closing braces and "else" keywords between the
// SIMD_IF_CONSTEXPR branches appear to be missing from the extracted text
// (extraction artifact) -- the two return statements per function are the
// COUNT-in-range path and the saturated/zero path; confirm against the
// original source file.
// ===========================================================================
// srai Byte: no 8-bit shift instruction exists, so shift 16-bit units twice
// (odd bytes directly, even bytes pre-shifted left by 8) and blend via the
// 0xff00 per-16-bit mask.  For COUNT >= 8 the result is the sign fill,
// computed as a compare-against-zero mask.
3161template <
size_t COUNT>
3162static SIMD_INLINE Vec<Byte, 32> srai(
const Vec<Byte, 32> &a)
3164 SIMD_IF_CONSTEXPR (COUNT < 8) {
3165 const __m256i odd = _mm256_srai_epi16(a, COUNT);
3166 const __m256i even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), COUNT + 8);
3167 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3170 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
// srai SignedByte: identical scheme.
3174template <
size_t COUNT>
3175static SIMD_INLINE Vec<SignedByte, 32> srai(
const Vec<SignedByte, 32> &a)
3177 SIMD_IF_CONSTEXPR (COUNT < 8) {
3178 const __m256i odd = _mm256_srai_epi16(a, COUNT);
3179 const __m256i even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), COUNT + 8);
3180 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3183 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
// srai Word/Short/Int: direct instruction; count clamped to the element's
// bit width minus one, which matches arithmetic-shift saturation semantics.
3187template <
size_t COUNT>
3188static SIMD_INLINE Vec<Word, 32> srai(
const Vec<Word, 32> &a)
3190 return _mm256_srai_epi16(a, vec::min(COUNT, 15ul));
3193template <
size_t COUNT>
3194static SIMD_INLINE Vec<Short, 32> srai(
const Vec<Short, 32> &a)
3196 return _mm256_srai_epi16(a, vec::min(COUNT, 15ul));
3199template <
size_t COUNT>
3200static SIMD_INLINE Vec<Int, 32> srai(
const Vec<Int, 32> &a)
3202 return _mm256_srai_epi32(a, vec::min(COUNT, 31ul));
// srai Long: no 64-bit arithmetic shift instruction; build the result from
// 32-bit shifts.  "odd" holds the arithmetically shifted high halves; "even"
// (declaration appears truncated by extraction) combines the logically
// shifted low halves with bits carried down from the high halves, or, for
// COUNT >= 32, an arithmetic shift of the high halves.  The 0xcc blend
// merges high/low 32-bit parts of each 64-bit element.
3205template <
size_t COUNT>
3206static SIMD_INLINE Vec<Long, 32> srai(
const Vec<Long, 32> &a)
3209 const __m256i odd = _mm256_srai_epi32(a, vec::min(COUNT, 31ul));
3211 SIMD_IF_CONSTEXPR (COUNT < 32) {
3213 _mm256_or_si256(_mm256_srli_epi32(a, COUNT),
3214 _mm256_slli_epi32(_mm256_srli_si256(a, 4), 32 - COUNT));
3217 _mm256_srai_epi32(_mm256_srli_si256(a, 4), vec::min(COUNT - 32, 31ul));
3219 return _mm256_blend_epi16(even, odd, 0xcc);
// Generic srai fallback: per-128-bit-half.
3225template <
size_t COUNT,
typename T>
3226static SIMD_INLINE Vec<T, 32> srai(
const Vec<T, 32> &a)
3228 return Vec<T, 32>(srai<COUNT>(a.lo()), srai<COUNT>(a.hi()));
// --- srli: logical right shift; COUNT >= element bits yields zero. --------
// srli Byte: 32-bit shift plus a per-byte mask to clear bits shifted in
// from the neighboring byte.
3241template <
size_t COUNT>
3242static SIMD_INLINE Vec<Byte, 32> srli(
const Vec<Byte, 32> &a)
3244 SIMD_IF_CONSTEXPR (COUNT < 8) {
3245 return _mm256_and_si256(_mm256_set1_epi8((int8_t) (0xff >> COUNT)),
3246 _mm256_srli_epi32(a, COUNT));
3248 return _mm256_setzero_si256();
3254template <
size_t COUNT>
3255static SIMD_INLINE Vec<SignedByte, 32> srli(
const Vec<SignedByte, 32> &a)
3257 SIMD_IF_CONSTEXPR (COUNT < 8) {
3258 return _mm256_and_si256(_mm256_set1_epi8((int8_t) (0xff >> COUNT)),
3259 _mm256_srli_epi32(a, COUNT));
3261 return _mm256_setzero_si256();
3265template <
size_t COUNT>
3266static SIMD_INLINE Vec<Word, 32> srli(
const Vec<Word, 32> &a)
3268 SIMD_IF_CONSTEXPR (COUNT < 16) {
3269 return _mm256_srli_epi16(a, COUNT);
3271 return _mm256_setzero_si256();
3275template <
size_t COUNT>
3276static SIMD_INLINE Vec<Short, 32> srli(
const Vec<Short, 32> &a)
3278 SIMD_IF_CONSTEXPR (COUNT < 16) {
3279 return _mm256_srli_epi16(a, COUNT);
3281 return _mm256_setzero_si256();
3285template <
size_t COUNT>
3286static SIMD_INLINE Vec<Int, 32> srli(
const Vec<Int, 32> &a)
3288 SIMD_IF_CONSTEXPR (COUNT < 32) {
3289 return _mm256_srli_epi32(a, COUNT);
3291 return _mm256_setzero_si256();
3295template <
size_t COUNT>
3296static SIMD_INLINE Vec<Long, 32> srli(
const Vec<Long, 32> &a)
3298 SIMD_IF_CONSTEXPR (COUNT < 64) {
3299 return _mm256_srli_epi64(a, COUNT);
3301 return _mm256_setzero_si256();
// Generic srli fallback: per-128-bit-half.
3308template <
size_t COUNT,
typename T>
3309static SIMD_INLINE Vec<T, 32> srli(
const Vec<T, 32> &a)
3311 return Vec<T, 32>(srli<COUNT>(a.lo()), srli<COUNT>(a.hi()));
// --- slli: logical left shift; COUNT >= element bits yields zero. ---------
// slli Byte: 32-bit shift plus a per-byte mask to clear bits shifted in
// from the neighboring byte.
3322template <
size_t COUNT>
3323static SIMD_INLINE Vec<Byte, 32> slli(
const Vec<Byte, 32> &a)
3325 SIMD_IF_CONSTEXPR (COUNT < 8) {
3328 return _mm256_and_si256(
3329 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
3330 _mm256_slli_epi32(a, COUNT));
3332 return _mm256_setzero_si256();
3336template <
size_t COUNT>
3337static SIMD_INLINE Vec<SignedByte, 32> slli(
const Vec<SignedByte, 32> &a)
3339 SIMD_IF_CONSTEXPR (COUNT < 8) {
3342 return _mm256_and_si256(
3343 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
3344 _mm256_slli_epi32(a, COUNT));
3346 return _mm256_setzero_si256();
3350template <
size_t COUNT>
3351static SIMD_INLINE Vec<Word, 32> slli(
const Vec<Word, 32> &a)
3353 SIMD_IF_CONSTEXPR (COUNT < 16) {
3354 return _mm256_slli_epi16(a, COUNT);
3356 return _mm256_setzero_si256();
3360template <
size_t COUNT>
3361static SIMD_INLINE Vec<Short, 32> slli(
const Vec<Short, 32> &a)
3363 SIMD_IF_CONSTEXPR (COUNT < 16) {
3364 return _mm256_slli_epi16(a, COUNT);
3366 return _mm256_setzero_si256();
3370template <
size_t COUNT>
3371static SIMD_INLINE Vec<Int, 32> slli(
const Vec<Int, 32> &a)
3373 SIMD_IF_CONSTEXPR (COUNT < 32) {
3374 return _mm256_slli_epi32(a, COUNT);
3376 return _mm256_setzero_si256();
3380template <
size_t COUNT>
3381static SIMD_INLINE Vec<Long, 32> slli(
const Vec<Long, 32> &a)
3383 SIMD_IF_CONSTEXPR (COUNT < 64) {
3384 return _mm256_slli_epi64(a, COUNT);
3386 return _mm256_setzero_si256();
// Generic slli fallback: per-128-bit-half.
3393template <
size_t COUNT,
typename T>
3394static SIMD_INLINE Vec<T, 32> slli(
const Vec<T, 32> &a)
3396 return Vec<T, 32>(slli<COUNT>(a.lo()), slli<COUNT>(a.hi()));
3409static SIMD_INLINE Vec<Byte, 32> sra(
const Vec<Byte, 32> &a,
3410 const uint8_t count)
3414 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3416 const __m256i odd = _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3417 const __m256i even =
3418 _mm256_sra_epi16(_mm256_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3419 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3422static SIMD_INLINE Vec<SignedByte, 32> sra(
const Vec<SignedByte, 32> &a,
3423 const uint8_t count)
3427 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3429 const __m256i odd = _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3430 const __m256i even =
3431 _mm256_sra_epi16(_mm256_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3432 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3435static SIMD_INLINE Vec<Word, 32> sra(
const Vec<Word, 32> &a,
3436 const uint8_t count)
3438 return _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3441static SIMD_INLINE Vec<Short, 32> sra(
const Vec<Short, 32> &a,
3442 const uint8_t count)
3444 return _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3447static SIMD_INLINE Vec<Int, 32> sra(
const Vec<Int, 32> &a,
const uint8_t count)
3449 return _mm256_sra_epi32(a, _mm_cvtsi32_si128(count));
3452static SIMD_INLINE Vec<Long, 32> sra(
const Vec<Long, 32> &a,
3453 const uint8_t count)
3457 const __m256i odd = _mm256_sra_epi32(a, _mm_cvtsi32_si128(count));
3460 even = _mm256_or_si256(
3461 _mm256_srl_epi32(a, _mm_cvtsi32_si128(count)),
3462 _mm256_sll_epi32(_mm256_srli_si256(a, 4), _mm_cvtsi32_si128(32 - count)));
3465 _mm256_sra_epi32(_mm256_srli_si256(a, 4), _mm_cvtsi32_si128(count - 32));
3467 return _mm256_blend_epi16(even, odd, 0xcc);
3473template <
typename T>
3474static SIMD_INLINE Vec<T, 32> sra(
const Vec<T, 32> &a,
const uint8_t count)
3476 return Vec<T, 32>(sra(a.lo(), count), sra(a.hi(), count));
3487static SIMD_INLINE Vec<Byte, 32> srl(
const Vec<Byte, 32> &a,
3488 const uint8_t count)
3490 return _mm256_and_si256(_mm256_srl_epi16(a, _mm_cvtsi32_si128(count)),
3491 _mm256_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3494static SIMD_INLINE Vec<SignedByte, 32> srl(
const Vec<SignedByte, 32> &a,
3495 const uint8_t count)
3497 return _mm256_and_si256(_mm256_srl_epi16(a, _mm_cvtsi32_si128(count)),
3498 _mm256_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3501static SIMD_INLINE Vec<Word, 32> srl(
const Vec<Word, 32> &a,
3502 const uint8_t count)
3504 return _mm256_srl_epi16(a, _mm_cvtsi32_si128(count));
3507static SIMD_INLINE Vec<Short, 32> srl(
const Vec<Short, 32> &a,
3508 const uint8_t count)
3510 return _mm256_srl_epi16(a, _mm_cvtsi32_si128(count));
3513static SIMD_INLINE Vec<Int, 32> srl(
const Vec<Int, 32> &a,
const uint8_t count)
3515 return _mm256_srl_epi32(a, _mm_cvtsi32_si128(count));
3518static SIMD_INLINE Vec<Long, 32> srl(
const Vec<Long, 32> &a,
3519 const uint8_t count)
3521 return _mm256_srl_epi64(a, _mm_cvtsi32_si128(count));
3527template <
typename T>
3528static SIMD_INLINE Vec<T, 32> srl(
const Vec<T, 32> &a,
const uint8_t count)
3530 return Vec<T, 32>(srl(a.lo(), count), srl(a.hi(), count));
3541static SIMD_INLINE Vec<Byte, 32> sll(
const Vec<Byte, 32> &a,
3542 const uint8_t count)
3544 return _mm256_and_si256(
3545 _mm256_sll_epi16(a, _mm_cvtsi32_si128(count)),
3546 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3549static SIMD_INLINE Vec<SignedByte, 32> sll(
const Vec<SignedByte, 32> &a,
3550 const uint8_t count)
3552 return _mm256_and_si256(
3553 _mm256_sll_epi16(a, _mm_cvtsi32_si128(count)),
3554 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3557static SIMD_INLINE Vec<Word, 32> sll(
const Vec<Word, 32> &a,
3558 const uint8_t count)
3560 return _mm256_sll_epi16(a, _mm_cvtsi32_si128(count));
3563static SIMD_INLINE Vec<Short, 32> sll(
const Vec<Short, 32> &a,
3564 const uint8_t count)
3566 return _mm256_sll_epi16(a, _mm_cvtsi32_si128(count));
3569static SIMD_INLINE Vec<Int, 32> sll(
const Vec<Int, 32> &a,
const uint8_t count)
3571 return _mm256_sll_epi32(a, _mm_cvtsi32_si128(count));
3574static SIMD_INLINE Vec<Long, 32> sll(
const Vec<Long, 32> &a,
3575 const uint8_t count)
3577 return _mm256_sll_epi64(a, _mm_cvtsi32_si128(count));
3583template <
typename T>
3584static SIMD_INLINE Vec<T, 32> sll(
const Vec<T, 32> &a,
const uint8_t count)
3586 return Vec<T, 32>(sll(a.lo(), count), sll(a.hi(), count));
3599template <
typename T>
3600static SIMD_INLINE Vec<T, 32> hadd(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
3603 unzip<1>(a, b, x, y);
3607static SIMD_INLINE Vec<Word, 32> hadd(
const Vec<Word, 32> &a,
3608 const Vec<Word, 32> &b)
3610 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi16(a, b));
3613static SIMD_INLINE Vec<Short, 32> hadd(
const Vec<Short, 32> &a,
3614 const Vec<Short, 32> &b)
3616 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi16(a, b));
3619static SIMD_INLINE Vec<Int, 32> hadd(
const Vec<Int, 32> &a,
3620 const Vec<Int, 32> &b)
3622 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi32(a, b));
3625static SIMD_INLINE Vec<Float, 32> hadd(
const Vec<Float, 32> &a,
3626 const Vec<Float, 32> &b)
3628 return x_mm256_transpose4x64_ps(_mm256_hadd_ps(a, b));
3631static SIMD_INLINE Vec<Double, 32> hadd(
const Vec<Double, 32> &a,
3632 const Vec<Double, 32> &b)
3634 return x_mm256_transpose4x64_pd(_mm256_hadd_pd(a, b));
3643template <
typename T>
3644static SIMD_INLINE Vec<T, 32> hadds(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
3647 unzip<1>(a, b, x, y);
3651static SIMD_INLINE Vec<Short, 32> hadds(
const Vec<Short, 32> &a,
3652 const Vec<Short, 32> &b)
3654 return x_mm256_transpose4x64_epi64(x_mm256_hadds_epi16(a, b));
3658static SIMD_INLINE Vec<Float, 32> hadds(
const Vec<Float, 32> &a,
3659 const Vec<Float, 32> &b)
3661 return x_mm256_transpose4x64_ps(_mm256_hadd_ps(a, b));
3665static SIMD_INLINE Vec<Double, 32> hadds(
const Vec<Double, 32> &a,
3666 const Vec<Double, 32> &b)
3668 return x_mm256_transpose4x64_pd(_mm256_hadd_pd(a, b));
3675template <
typename T>
3676static SIMD_INLINE Vec<T, 32> hsub(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
3679 unzip<1>(a, b, x, y);
3683static SIMD_INLINE Vec<Word, 32> hsub(
const Vec<Word, 32> &a,
3684 const Vec<Word, 32> &b)
3686 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi16(a, b));
3689static SIMD_INLINE Vec<Short, 32> hsub(
const Vec<Short, 32> &a,
3690 const Vec<Short, 32> &b)
3692 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi16(a, b));
3695static SIMD_INLINE Vec<Int, 32> hsub(
const Vec<Int, 32> &a,
3696 const Vec<Int, 32> &b)
3698 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi32(a, b));
3701static SIMD_INLINE Vec<Float, 32> hsub(
const Vec<Float, 32> &a,
3702 const Vec<Float, 32> &b)
3704 return x_mm256_transpose4x64_ps(_mm256_hsub_ps(a, b));
3707static SIMD_INLINE Vec<Double, 32> hsub(
const Vec<Double, 32> &a,
3708 const Vec<Double, 32> &b)
3710 return x_mm256_transpose4x64_pd(_mm256_hsub_pd(a, b));
3719template <
typename T>
3720static SIMD_INLINE Vec<T, 32> hsubs(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
3723 unzip<1>(a, b, x, y);
3727static SIMD_INLINE Vec<Short, 32> hsubs(
const Vec<Short, 32> &a,
3728 const Vec<Short, 32> &b)
3730 return x_mm256_transpose4x64_epi64(x_mm256_hsubs_epi16(a, b));
3734static SIMD_INLINE Vec<Float, 32> hsubs(
const Vec<Float, 32> &a,
3735 const Vec<Float, 32> &b)
3737 return x_mm256_transpose4x64_ps(_mm256_hsub_ps(a, b));
3741static SIMD_INLINE Vec<Double, 32> hsubs(
const Vec<Double, 32> &a,
3742 const Vec<Double, 32> &b)
3744 return x_mm256_transpose4x64_pd(_mm256_hsub_pd(a, b));
3751template <
size_t COUNT,
typename T>
3752static SIMD_INLINE Vec<T, 32> srle(
const Vec<T, 32> &a)
3754 const __m256i aInt = reinterpret(a, OutputType<Int>());
3755 const Vec<Int, 32> aShifted = x_mm256_srli256_si256<COUNT * sizeof(T)>(aInt);
3756 return reinterpret(aShifted, OutputType<T>());
3763template <
size_t COUNT,
typename T>
3764static SIMD_INLINE Vec<T, 32> slle(
const Vec<T, 32> &a)
3766 const __m256i aInt = reinterpret(a, OutputType<Int>());
3767 const Vec<Int, 32> aShifted = x_mm256_slli256_si256<COUNT * sizeof(T)>(aInt);
3768 return reinterpret(aShifted, OutputType<T>());
3776template <
size_t COUNT,
typename T>
3777static SIMD_INLINE Vec<T, 32> alignre(
const Vec<T, 32> &h,
const Vec<T, 32> &l)
3779 const auto intH = reinterpret(h, OutputType<Int>());
3780 const auto intL = reinterpret(l, OutputType<Int>());
3781 const Vec<Int, 32> intRes =
3782 x_mm256_alignr256_epi8<COUNT * sizeof(T)>(intH, intL);
3783 return reinterpret(intRes, OutputType<T>());
3793template <
size_t ALIGNOFF>
3794static SIMD_INLINE __m256i align_shuffle_256(__m256i lo, __m256i hi,
3797 static_assert(ALIGNOFF < 32,
"");
3798 return x_mm256_shuffle_epi8(x_mm256_alignr_epi8<ALIGNOFF>(hi, lo), mask);
3810template <
typename T>
3811static SIMD_INLINE
void swizzle(Vec<T, 32>[1], Integer<1>)
3819template <
typename T,
3820 SIMD_ENABLE_IF(
sizeof(T) <= 2 && std::is_integral<T>::value)>
3821static SIMD_INLINE
void swizzle(Vec<T, 32> v[2], Integer<2>)
3824 swizzle_32_16<2>(v, vs);
3825 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<2, T>());
3826 const __m256i s[2] = {
3827 x_mm256_shuffle_epi8(vs[0], mask),
3828 x_mm256_shuffle_epi8(vs[1], mask),
3830 v[0] = x_mm256_unpacklo_epi64(s[0], s[1]);
3831 v[1] = x_mm256_unpackhi_epi64(s[0], s[1]);
3835template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3836static SIMD_INLINE
void swizzle(Vec<T, 32> v[2], Integer<2>)
3838 const Vec<Float, 32> vFloat[2] = {
3839 reinterpret(v[0], OutputType<Float>()),
3840 reinterpret(v[1], OutputType<Float>()),
3842 Vec<Float, 32> vs[2];
3843 swizzle_32_16<2>(vFloat, vs);
3844 const Vec<Float, 32> vOut[2] = {
3845 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(2, 0, 2, 0)),
3846 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 1, 3, 1)),
3848 v[0] = reinterpret(vOut[0], OutputType<T>());
3849 v[1] = reinterpret(vOut[1], OutputType<T>());
3853template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3855static SIMD_INLINE
void swizzle(Vec<T, 32> v[2], Integer<2>)
3857 const Vec<Double, 32> vDouble[2] = {
3858 reinterpret(v[0], OutputType<Double>()),
3859 reinterpret(v[1], OutputType<Double>()),
3861 Vec<Double, 32> vs[2];
3862 swizzle_32_16<2>(vDouble, vs);
3863 const Vec<Double, 32> vOut[2] = {
3864 _mm256_shuffle_pd(vs[0], vs[1], 0),
3865 _mm256_shuffle_pd(vs[0], vs[1], 0xf),
3867 v[0] = reinterpret(vOut[0], OutputType<T>());
3868 v[1] = reinterpret(vOut[1], OutputType<T>());
3874template <
typename T,
3875 SIMD_ENABLE_IF(
sizeof(T) <= 2 && std::is_integral<T>::value)>
3876static SIMD_INLINE
void swizzle(Vec<T, 32> v[3], Integer<3>)
3879 swizzle_32_16<3>(v, vs);
3880 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<3, T>());
3881 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
3882 const __m256i s1 = align_shuffle_256<12>(vs[0], vs[1], mask);
3883 const __m256i s2 = align_shuffle_256<8>(vs[1], vs[2], mask);
3885 align_shuffle_256<4>(vs[2], _mm256_undefined_si256(), mask);
3886 const __m256i l01 = x_mm256_unpacklo_epi32(s0, s1);
3887 const __m256i h01 = x_mm256_unpackhi_epi32(s0, s1);
3888 const __m256i l23 = x_mm256_unpacklo_epi32(s2, s3);
3889 const __m256i h23 = x_mm256_unpackhi_epi32(s2, s3);
3890 v[0] = x_mm256_unpacklo_epi64(l01, l23);
3891 v[1] = x_mm256_unpackhi_epi64(l01, l23);
3892 v[2] = x_mm256_unpacklo_epi64(h01, h23);
3898template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3899static SIMD_INLINE
void swizzle(Vec<T, 32> v[3], Integer<3>)
3901 const Vec<Float, 32> vFloat[3] = {
3902 reinterpret(v[0], OutputType<Float>()),
3903 reinterpret(v[1], OutputType<Float>()),
3904 reinterpret(v[2], OutputType<Float>()),
3906 Vec<Float, 32> vs[3];
3907 swizzle_32_16<3>(vFloat, vs);
3911 const Vec<Float, 32> x2y2x3y3 =
3912 _mm256_shuffle_ps(vs[1], vs[2], _MM_SHUFFLE(2, 1, 3, 2));
3913 const Vec<Float, 32> y0z0y1z1 =
3914 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 2, 1));
3915 const Vec<Float, 32> x0x1x2x3 =
3916 _mm256_shuffle_ps(vs[0], x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
3917 const Vec<Float, 32> y0y1y2y3 =
3918 _mm256_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
3919 const Vec<Float, 32> z0z1z2z3 =
3920 _mm256_shuffle_ps(y0z0y1z1, vs[2], _MM_SHUFFLE(3, 0, 3, 1));
3921 v[0] = reinterpret(x0x1x2x3, OutputType<T>());
3922 v[1] = reinterpret(y0y1y2y3, OutputType<T>());
3923 v[2] = reinterpret(z0z1z2z3, OutputType<T>());
3927template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
3929static SIMD_INLINE
void swizzle(Vec<T, 32> v[3], Integer<3>)
3931 const Vec<Double, 32> vDouble[3] = {
3932 reinterpret(v[0], OutputType<Double>()),
3933 reinterpret(v[1], OutputType<Double>()),
3934 reinterpret(v[2], OutputType<Double>()),
3936 Vec<Double, 32> vs[3];
3937 swizzle_32_16<3>(vDouble, vs);
3941 const Vec<Double, 32> vOut[3] = {
3943 _mm256_shuffle_pd(vs[0], vs[1], 0xa),
3945 _mm256_shuffle_pd(vs[0], vs[2], 0x5),
3947 _mm256_shuffle_pd(vs[1], vs[2], 0xa),
3949 v[0] = reinterpret(vOut[0], OutputType<T>());
3950 v[1] = reinterpret(vOut[1], OutputType<T>());
3951 v[2] = reinterpret(vOut[2], OutputType<T>());
3957template <
typename T,
3958 SIMD_ENABLE_IF((
sizeof(T) <= 2 && std::is_integral<T>::value))>
3959static SIMD_INLINE
void swizzle(Vec<T, 32> v[4], Integer<4>)
3962 swizzle_32_16<4>(v, vs);
3963 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<4, T>());
3964 const __m256i s[4] = {
3965 x_mm256_shuffle_epi8(vs[0], mask),
3966 x_mm256_shuffle_epi8(vs[1], mask),
3967 x_mm256_shuffle_epi8(vs[2], mask),
3968 x_mm256_shuffle_epi8(vs[3], mask),
3970 const __m256i l01 = x_mm256_unpacklo_epi32(s[0], s[1]);
3971 const __m256i h01 = x_mm256_unpackhi_epi32(s[0], s[1]);
3972 const __m256i l23 = x_mm256_unpacklo_epi32(s[2], s[3]);
3973 const __m256i h23 = x_mm256_unpackhi_epi32(s[2], s[3]);
3974 v[0] = x_mm256_unpacklo_epi64(l01, l23);
3975 v[1] = x_mm256_unpackhi_epi64(l01, l23);
3976 v[2] = x_mm256_unpacklo_epi64(h01, h23);
3977 v[3] = x_mm256_unpackhi_epi64(h01, h23);
3981template <
typename T, SIMD_ENABLE_IF(sizeof(T) == 4),
typename =
void>
3982static SIMD_INLINE
void swizzle(Vec<T, 32> v[4], Integer<4>)
3984 Vec<Float, 32> vInt[4];
3985 for (
size_t i = 0; i < 4; ++i) {
3986 vInt[i] = reinterpret(v[i], OutputType<Float>());
3988 Vec<Float, 32> vs[4];
3989 swizzle_32_16<4>(vInt, vs);
3990 const __m256 s[4] = {
3991 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 1, 0)),
3992 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 2, 3, 2)),
3993 _mm256_shuffle_ps(vs[2], vs[3], _MM_SHUFFLE(1, 0, 1, 0)),
3994 _mm256_shuffle_ps(vs[2], vs[3], _MM_SHUFFLE(3, 2, 3, 2)),
3996 const Vec<Float, 32> vOut[4] = {
3997 _mm256_shuffle_ps(s[0], s[2], _MM_SHUFFLE(2, 0, 2, 0)),
3998 _mm256_shuffle_ps(s[0], s[2], _MM_SHUFFLE(3, 1, 3, 1)),
3999 _mm256_shuffle_ps(s[1], s[3], _MM_SHUFFLE(2, 0, 2, 0)),
4000 _mm256_shuffle_ps(s[1], s[3], _MM_SHUFFLE(3, 1, 3, 1)),
4002 for (
size_t i = 0; i < 4; ++i) {
4003 v[i] = reinterpret(vOut[i], OutputType<T>());
4008template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
4010static SIMD_INLINE
void swizzle(Vec<T, 32> v[4], Integer<4>)
4012 const Vec<Double, 32> vInt[4] = {
4013 reinterpret(v[0], OutputType<Double>()),
4014 reinterpret(v[1], OutputType<Double>()),
4015 reinterpret(v[2], OutputType<Double>()),
4016 reinterpret(v[3], OutputType<Double>()),
4018 Vec<Double, 32> vs[4];
4019 swizzle_32_16<4>(vInt, vs);
4024 const Vec<Double, 32> vOut[4] = {
4026 _mm256_shuffle_pd(vs[0], vs[2], 0x0),
4028 _mm256_shuffle_pd(vs[0], vs[2], 0xF),
4030 _mm256_shuffle_pd(vs[1], vs[3], 0x0),
4032 _mm256_shuffle_pd(vs[1], vs[3], 0xF),
4034 for (
size_t i = 0; i < 4; ++i) {
4035 v[i] = reinterpret(vOut[i], OutputType<T>());
4042template <
typename T,
4043 SIMD_ENABLE_IF(
sizeof(T) == 1 && std::is_integral<T>::value)>
4044static SIMD_INLINE
void swizzle(Vec<T, 32> v[5], Integer<5>)
4047 swizzle_32_16<5>(v, vs);
4048 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<5, T>());
4049 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
4050 const __m256i s1 = align_shuffle_256<10>(vs[0], vs[1], mask);
4051 const __m256i s2 = align_shuffle_256<4>(vs[1], vs[2], mask);
4052 const __m256i s3 = align_shuffle_256<14>(vs[1], vs[2], mask);
4053 const __m256i s4 = align_shuffle_256<8>(vs[2], vs[3], mask);
4054 const __m256i s5 = align_shuffle_256<2>(vs[3], vs[4], mask);
4055 const __m256i s6 = align_shuffle_256<12>(vs[3], vs[4], mask);
4057 align_shuffle_256<6>(vs[4], _mm256_undefined_si256(), mask);
4058 const __m256i l01 = x_mm256_unpacklo_epi16(s0, s1);
4059 const __m256i h01 = x_mm256_unpackhi_epi16(s0, s1);
4060 const __m256i l23 = x_mm256_unpacklo_epi16(s2, s3);
4061 const __m256i h23 = x_mm256_unpackhi_epi16(s2, s3);
4062 const __m256i l45 = x_mm256_unpacklo_epi16(s4, s5);
4063 const __m256i h45 = x_mm256_unpackhi_epi16(s4, s5);
4064 const __m256i l67 = x_mm256_unpacklo_epi16(s6, s7);
4065 const __m256i h67 = x_mm256_unpackhi_epi16(s6, s7);
4066 const __m256i ll01l23 = x_mm256_unpacklo_epi32(l01, l23);
4067 const __m256i hl01l23 = x_mm256_unpackhi_epi32(l01, l23);
4068 const __m256i ll45l67 = x_mm256_unpacklo_epi32(l45, l67);
4069 const __m256i hl45l67 = x_mm256_unpackhi_epi32(l45, l67);
4070 const __m256i lh01h23 = x_mm256_unpacklo_epi32(h01, h23);
4071 const __m256i lh45h67 = x_mm256_unpacklo_epi32(h45, h67);
4072 v[0] = x_mm256_unpacklo_epi64(ll01l23, ll45l67);
4073 v[1] = x_mm256_unpackhi_epi64(ll01l23, ll45l67);
4074 v[2] = x_mm256_unpacklo_epi64(hl01l23, hl45l67);
4075 v[3] = x_mm256_unpackhi_epi64(hl01l23, hl45l67);
4076 v[4] = x_mm256_unpacklo_epi64(lh01h23, lh45h67);
4080template <
typename T,
4081 SIMD_ENABLE_IF(
sizeof(T) == 2 && std::is_integral<T>::value),
4083static SIMD_INLINE
void swizzle(Vec<T, 32> v[5], Integer<5>)
4086 swizzle_32_16<5>(v, vs);
4087 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<5, T>());
4088 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
4089 const __m256i s1 = align_shuffle_256<6>(vs[0], vs[1], mask);
4090 const __m256i s2 = align_shuffle_256<4>(vs[1], vs[2], mask);
4091 const __m256i s3 = align_shuffle_256<10>(vs[1], vs[2], mask);
4092 const __m256i s4 = align_shuffle_256<8>(vs[2], vs[3], mask);
4093 const __m256i s5 = align_shuffle_256<14>(vs[2], vs[3], mask);
4094 const __m256i s6 = align_shuffle_256<12>(vs[3], vs[4], mask);
4096 align_shuffle_256<2>(vs[4], _mm256_undefined_si256(), mask);
4097 const __m256i l02 = x_mm256_unpacklo_epi32(s0, s2);
4098 const __m256i h02 = x_mm256_unpackhi_epi32(s0, s2);
4099 const __m256i l13 = x_mm256_unpacklo_epi32(s1, s3);
4100 const __m256i l46 = x_mm256_unpacklo_epi32(s4, s6);
4101 const __m256i h46 = x_mm256_unpackhi_epi32(s4, s6);
4102 const __m256i l57 = x_mm256_unpacklo_epi32(s5, s7);
4103 v[0] = x_mm256_unpacklo_epi64(l02, l46);
4104 v[1] = x_mm256_unpackhi_epi64(l02, l46);
4105 v[2] = x_mm256_unpacklo_epi64(h02, h46);
4106 v[3] = x_mm256_unpacklo_epi64(l13, l57);
4107 v[4] = x_mm256_unpackhi_epi64(l13, l57);
4111template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 4),
typename = void,
4113static SIMD_INLINE
void swizzle(Vec<T, 32> v[5], Integer<5>)
4115 Vec<Int, 32> vInt[5];
4116 for (
size_t i = 0; i < 5; i++) {
4117 vInt[i] = reinterpret(v[i], OutputType<Int>());
4120 swizzle_32_16<5>(vInt, vs);
4127 const __m256i s2 = x_mm256_alignr_epi8<4>(vs[2], vs[1]);
4131 const __m256i s3 = x_mm256_alignr_epi8<4>(vs[3], vs[2]);
4135 const __m256i s4 = x_mm256_alignr_epi8<8>(vs[3], vs[2]);
4139 const __m256i s5 = x_mm256_alignr_epi8<8>(vs[4], vs[3]);
4143 const __m256i s6 = x_mm256_alignr_epi8<12>(vs[4], vs[3]);
4147 const __m256i s7 = x_mm256_alignr_epi8<12>(vs[0], vs[4]);
4149 const __m256i l02 = x_mm256_unpacklo_epi32(vs[0], s2);
4150 const __m256i h02 = x_mm256_unpackhi_epi32(vs[0], s2);
4152 const __m256i l13 = x_mm256_unpacklo_epi32(vs[1], s3);
4154 const __m256i l46 = x_mm256_unpacklo_epi32(s4, s6);
4155 const __m256i h46 = x_mm256_unpackhi_epi32(s4, s6);
4157 const __m256i l57 = x_mm256_unpacklo_epi32(s5, s7);
4158 const Vec<Int, 32> vOut[5] = {
4160 x_mm256_unpacklo_epi64(l02, l46),
4161 x_mm256_unpackhi_epi64(l02, l46),
4163 x_mm256_unpacklo_epi64(h02, h46),
4164 x_mm256_unpackhi_epi64(h02, h46),
4166 x_mm256_unpacklo_epi64(l13, l57),
4168 for (
size_t i = 0; i < 5; i++) {
4169 v[i] = reinterpret(vOut[i], OutputType<T>());
4174template <
typename T, SIMD_ENABLE_IF(
sizeof(T) == 8),
typename = void,
4175 typename = void,
typename =
void>
4176static SIMD_INLINE
void swizzle(Vec<T, 32> v[5], Integer<5>)
4178 const Vec<Double, 32> vDouble[5] = {
4179 reinterpret(v[0], OutputType<Double>()),
4180 reinterpret(v[1], OutputType<Double>()),
4181 reinterpret(v[2], OutputType<Double>()),
4182 reinterpret(v[3], OutputType<Double>()),
4183 reinterpret(v[4], OutputType<Double>()),
4185 Vec<Double, 32> vs[5];
4186 swizzle_32_16<5>(vDouble, vs);
4192 const Vec<Double, 32> vOut[5] = {
4194 _mm256_shuffle_pd(vs[0], vs[2], 0xa),
4196 _mm256_shuffle_pd(vs[0], vs[3], 0x5),
4198 _mm256_shuffle_pd(vs[1], vs[3], 0xa),
4200 _mm256_shuffle_pd(vs[1], vs[4], 0x5),
4202 _mm256_shuffle_pd(vs[2], vs[4], 0xa),
4204 for (
size_t i = 0; i < 5; i++) {
4205 v[i] = reinterpret(vOut[i], OutputType<T>());
4226static SIMD_INLINE Vec<Byte, 32> cmplt(
const Vec<Byte, 32> &a,
4227 const Vec<Byte, 32> &b)
4229 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4230 const __m256i a1 = _mm256_xor_si256(a, signbit);
4231 const __m256i b1 = _mm256_xor_si256(b, signbit);
4232 return _mm256_cmpgt_epi8(b1, a1);
4235static SIMD_INLINE Vec<SignedByte, 32> cmplt(
const Vec<SignedByte, 32> &a,
4236 const Vec<SignedByte, 32> &b)
4238 return _mm256_cmpgt_epi8(b, a);
4241static SIMD_INLINE Vec<Word, 32> cmplt(
const Vec<Word, 32> &a,
4242 const Vec<Word, 32> &b)
4244 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4245 const __m256i a1 = _mm256_xor_si256(a, signbit);
4246 const __m256i b1 = _mm256_xor_si256(b, signbit);
4247 return _mm256_cmpgt_epi16(b1, a1);
4250static SIMD_INLINE Vec<Short, 32> cmplt(
const Vec<Short, 32> &a,
4251 const Vec<Short, 32> &b)
4253 return _mm256_cmpgt_epi16(b, a);
4256static SIMD_INLINE Vec<Int, 32> cmplt(
const Vec<Int, 32> &a,
4257 const Vec<Int, 32> &b)
4259 return _mm256_cmpgt_epi32(b, a);
4262static SIMD_INLINE Vec<Long, 32> cmplt(
const Vec<Long, 32> &a,
4263 const Vec<Long, 32> &b)
4265 return _mm256_cmpgt_epi64(b, a);
4271template <
typename T>
4272static SIMD_INLINE Vec<T, 32> cmplt(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4274 return Vec<T, 32>(cmplt(a.lo(), b.lo()), cmplt(a.hi(), b.hi()));
4279static SIMD_INLINE Vec<Float, 32> cmplt(
const Vec<Float, 32> &a,
4280 const Vec<Float, 32> &b)
4284 return _mm256_cmp_ps(a, b, _CMP_LT_OS);
4287static SIMD_INLINE Vec<Double, 32> cmplt(
const Vec<Double, 32> &a,
4288 const Vec<Double, 32> &b)
4290 return _mm256_cmp_pd(a, b, _CMP_LT_OS);
4302static SIMD_INLINE Vec<Byte, 32> cmple(
const Vec<Byte, 32> &a,
4303 const Vec<Byte, 32> &b)
4305 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4306 const __m256i a1 = _mm256_xor_si256(a, signbit);
4307 const __m256i b1 = _mm256_xor_si256(b, signbit);
4308 return _mm256_or_si256(_mm256_cmpgt_epi8(b1, a1), _mm256_cmpeq_epi8(a1, b1));
4311static SIMD_INLINE Vec<SignedByte, 32> cmple(
const Vec<SignedByte, 32> &a,
4312 const Vec<SignedByte, 32> &b)
4314 return _mm256_or_si256(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(a, b));
4317static SIMD_INLINE Vec<Word, 32> cmple(
const Vec<Word, 32> &a,
4318 const Vec<Word, 32> &b)
4320 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4321 const __m256i a1 = _mm256_xor_si256(a, signbit);
4322 const __m256i b1 = _mm256_xor_si256(b, signbit);
4323 return _mm256_or_si256(_mm256_cmpgt_epi16(b1, a1),
4324 _mm256_cmpeq_epi16(a1, b1));
4327static SIMD_INLINE Vec<Short, 32> cmple(
const Vec<Short, 32> &a,
4328 const Vec<Short, 32> &b)
4330 return _mm256_or_si256(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(a, b));
4333static SIMD_INLINE Vec<Int, 32> cmple(
const Vec<Int, 32> &a,
4334 const Vec<Int, 32> &b)
4336 return _mm256_or_si256(_mm256_cmpgt_epi32(b, a), _mm256_cmpeq_epi32(a, b));
4339static SIMD_INLINE Vec<Long, 32> cmple(
const Vec<Long, 32> &a,
4340 const Vec<Long, 32> &b)
4342 return _mm256_or_si256(_mm256_cmpgt_epi64(b, a), _mm256_cmpeq_epi64(a, b));
4348template <
typename T>
4349static SIMD_INLINE Vec<T, 32> cmple(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4351 return Vec<T, 32>(cmple(a.lo(), b.lo()), cmple(a.hi(), b.hi()));
4356static SIMD_INLINE Vec<Float, 32> cmple(
const Vec<Float, 32> &a,
4357 const Vec<Float, 32> &b)
4361 return _mm256_cmp_ps(a, b, _CMP_LE_OS);
4364static SIMD_INLINE Vec<Double, 32> cmple(
const Vec<Double, 32> &a,
4365 const Vec<Double, 32> &b)
4367 return _mm256_cmp_pd(a, b, _CMP_LE_OS);
4376static SIMD_INLINE Vec<Byte, 32> cmpeq(
const Vec<Byte, 32> &a,
4377 const Vec<Byte, 32> &b)
4379 return _mm256_cmpeq_epi8(a, b);
4382static SIMD_INLINE Vec<SignedByte, 32> cmpeq(
const Vec<SignedByte, 32> &a,
4383 const Vec<SignedByte, 32> &b)
4385 return _mm256_cmpeq_epi8(a, b);
4388static SIMD_INLINE Vec<Word, 32> cmpeq(
const Vec<Word, 32> &a,
4389 const Vec<Word, 32> &b)
4391 return _mm256_cmpeq_epi16(a, b);
4394static SIMD_INLINE Vec<Short, 32> cmpeq(
const Vec<Short, 32> &a,
4395 const Vec<Short, 32> &b)
4397 return _mm256_cmpeq_epi16(a, b);
4400static SIMD_INLINE Vec<Int, 32> cmpeq(
const Vec<Int, 32> &a,
4401 const Vec<Int, 32> &b)
4403 return _mm256_cmpeq_epi32(a, b);
4406static SIMD_INLINE Vec<Long, 32> cmpeq(
const Vec<Long, 32> &a,
4407 const Vec<Long, 32> &b)
4409 return _mm256_cmpeq_epi64(a, b);
4415template <
typename T>
4416static SIMD_INLINE Vec<T, 32> cmpeq(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4418 return Vec<T, 32>(cmpeq(a.lo(), b.lo()), cmpeq(a.hi(), b.hi()));
4423static SIMD_INLINE Vec<Float, 32> cmpeq(
const Vec<Float, 32> &a,
4424 const Vec<Float, 32> &b)
4428 return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
4431static SIMD_INLINE Vec<Double, 32> cmpeq(
const Vec<Double, 32> &a,
4432 const Vec<Double, 32> &b)
4434 return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
4446static SIMD_INLINE Vec<Byte, 32> cmpgt(
const Vec<Byte, 32> &a,
4447 const Vec<Byte, 32> &b)
4449 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4450 const __m256i a1 = _mm256_xor_si256(a, signbit);
4451 const __m256i b1 = _mm256_xor_si256(b, signbit);
4452 return _mm256_cmpgt_epi8(a1, b1);
4455static SIMD_INLINE Vec<SignedByte, 32> cmpgt(
const Vec<SignedByte, 32> &a,
4456 const Vec<SignedByte, 32> &b)
4458 return _mm256_cmpgt_epi8(a, b);
4461static SIMD_INLINE Vec<Word, 32> cmpgt(
const Vec<Word, 32> &a,
4462 const Vec<Word, 32> &b)
4464 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4465 const __m256i a1 = _mm256_xor_si256(a, signbit);
4466 const __m256i b1 = _mm256_xor_si256(b, signbit);
4467 return _mm256_cmpgt_epi16(a1, b1);
4470static SIMD_INLINE Vec<Short, 32> cmpgt(
const Vec<Short, 32> &a,
4471 const Vec<Short, 32> &b)
4473 return _mm256_cmpgt_epi16(a, b);
4476static SIMD_INLINE Vec<Int, 32> cmpgt(
const Vec<Int, 32> &a,
4477 const Vec<Int, 32> &b)
4479 return _mm256_cmpgt_epi32(a, b);
4482static SIMD_INLINE Vec<Long, 32> cmpgt(
const Vec<Long, 32> &a,
4483 const Vec<Long, 32> &b)
4485 return _mm256_cmpgt_epi64(a, b);
4491template <
typename T>
4492static SIMD_INLINE Vec<T, 32> cmpgt(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4494 return Vec<T, 32>(cmpgt(a.lo(), b.lo()), cmpgt(a.hi(), b.hi()));
4499static SIMD_INLINE Vec<Float, 32> cmpgt(
const Vec<Float, 32> &a,
4500 const Vec<Float, 32> &b)
4504 return _mm256_cmp_ps(b, a, _CMP_LT_OS);
4507static SIMD_INLINE Vec<Double, 32> cmpgt(
const Vec<Double, 32> &a,
4508 const Vec<Double, 32> &b)
4510 return _mm256_cmp_pd(b, a, _CMP_LT_OS);
4522static SIMD_INLINE Vec<Byte, 32> cmpge(
const Vec<Byte, 32> &a,
4523 const Vec<Byte, 32> &b)
4525 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4526 const __m256i a1 = _mm256_xor_si256(a, signbit);
4527 const __m256i b1 = _mm256_xor_si256(b, signbit);
4528 return _mm256_or_si256(_mm256_cmpgt_epi8(a1, b1), _mm256_cmpeq_epi8(a1, b1));
4531static SIMD_INLINE Vec<SignedByte, 32> cmpge(
const Vec<SignedByte, 32> &a,
4532 const Vec<SignedByte, 32> &b)
4534 return _mm256_or_si256(_mm256_cmpgt_epi8(a, b), _mm256_cmpeq_epi8(a, b));
4537static SIMD_INLINE Vec<Word, 32> cmpge(
const Vec<Word, 32> &a,
4538 const Vec<Word, 32> &b)
4540 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4541 const __m256i a1 = _mm256_xor_si256(a, signbit);
4542 const __m256i b1 = _mm256_xor_si256(b, signbit);
4543 return _mm256_or_si256(_mm256_cmpgt_epi16(a1, b1),
4544 _mm256_cmpeq_epi16(a1, b1));
4547static SIMD_INLINE Vec<Short, 32> cmpge(
const Vec<Short, 32> &a,
4548 const Vec<Short, 32> &b)
4550 return _mm256_or_si256(_mm256_cmpgt_epi16(a, b), _mm256_cmpeq_epi16(a, b));
4553static SIMD_INLINE Vec<Int, 32> cmpge(
const Vec<Int, 32> &a,
4554 const Vec<Int, 32> &b)
4556 return _mm256_or_si256(_mm256_cmpgt_epi32(a, b), _mm256_cmpeq_epi32(a, b));
4559static SIMD_INLINE Vec<Long, 32> cmpge(
const Vec<Long, 32> &a,
4560 const Vec<Long, 32> &b)
4562 return _mm256_or_si256(_mm256_cmpgt_epi64(a, b), _mm256_cmpeq_epi64(a, b));
4568template <
typename T>
4569static SIMD_INLINE Vec<T, 32> cmpge(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4571 return Vec<T, 32>(cmpge(a.lo(), b.lo()), cmpge(a.hi(), b.hi()));
4576static SIMD_INLINE Vec<Float, 32> cmpge(
const Vec<Float, 32> &a,
4577 const Vec<Float, 32> &b)
4581 return _mm256_cmp_ps(b, a, _CMP_LE_OS);
4584static SIMD_INLINE Vec<Double, 32> cmpge(
const Vec<Double, 32> &a,
4585 const Vec<Double, 32> &b)
4587 return _mm256_cmp_pd(b, a, _CMP_LE_OS);
4599static SIMD_INLINE Vec<Byte, 32> cmpneq(
const Vec<Byte, 32> &a,
4600 const Vec<Byte, 32> &b)
4602 return _mm256_xor_si256(_mm256_cmpeq_epi8(a, b), _mm256_set1_epi32(-1));
4605static SIMD_INLINE Vec<SignedByte, 32> cmpneq(
const Vec<SignedByte, 32> &a,
4606 const Vec<SignedByte, 32> &b)
4608 return _mm256_xor_si256(_mm256_cmpeq_epi8(a, b), _mm256_set1_epi32(-1));
4611static SIMD_INLINE Vec<Word, 32> cmpneq(
const Vec<Word, 32> &a,
4612 const Vec<Word, 32> &b)
4614 return _mm256_xor_si256(_mm256_cmpeq_epi16(a, b), _mm256_set1_epi32(-1));
4617static SIMD_INLINE Vec<Short, 32> cmpneq(
const Vec<Short, 32> &a,
4618 const Vec<Short, 32> &b)
4620 return _mm256_xor_si256(_mm256_cmpeq_epi16(a, b), _mm256_set1_epi32(-1));
4623static SIMD_INLINE Vec<Int, 32> cmpneq(
const Vec<Int, 32> &a,
4624 const Vec<Int, 32> &b)
4626 return _mm256_xor_si256(_mm256_cmpeq_epi32(a, b), _mm256_set1_epi32(-1));
4629static SIMD_INLINE Vec<Long, 32> cmpneq(
const Vec<Long, 32> &a,
4630 const Vec<Long, 32> &b)
4632 return _mm256_xor_si256(_mm256_cmpeq_epi64(a, b), _mm256_set1_epi32(-1));
4638template <
typename T>
4639static SIMD_INLINE Vec<T, 32> cmpneq(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4641 return Vec<T, 32>(cmpneq(a.lo(), b.lo()), cmpneq(a.hi(), b.hi()));
4646static SIMD_INLINE Vec<Float, 32> cmpneq(
const Vec<Float, 32> &a,
4647 const Vec<Float, 32> &b)
4651 return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ);
4654static SIMD_INLINE Vec<Double, 32> cmpneq(
const Vec<Double, 32> &a,
4655 const Vec<Double, 32> &b)
4657 return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);
4669template <
typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4670static SIMD_INLINE Vec<T, 32> ifelse(const Vec<T, 32> &cond,
4671 const Vec<T, 32> &trueVal,
4672 const Vec<T, 32> &falseVal)
4675 const Vec<Byte, 32> res =
4676 _mm256_blendv_epi8(re
interpret(falseVal, OutputType<Byte>()),
4677 re
interpret(trueVal, OutputType<Byte>()),
4678 re
interpret(cond, OutputType<Byte>()));
4681 const Vec<Float, 32> res =
4682 _mm256_or_ps(_mm256_and_ps(re
interpret(cond, OutputType<Float>()),
4683 re
interpret(trueVal, OutputType<Float>())),
4684 _mm256_andnot_ps(re
interpret(cond, OutputType<Float>()),
4685 re
interpret(falseVal, OutputType<Float>())));
4687 return re
interpret(res, OutputType<T>());
4691template <
typename T, SIMD_ENABLE_IF(sizeof(T) > 2),
typename =
void>
4692static SIMD_INLINE Vec<T, 32> ifelse(
const Vec<T, 32> &cond,
4693 const Vec<T, 32> &trueVal,
4694 const Vec<T, 32> &falseVal)
4696 const Vec<Float, 32> res =
4697 _mm256_blendv_ps(reinterpret(falseVal, OutputType<Float>()),
4698 reinterpret(trueVal, OutputType<Float>()),
4699 reinterpret(cond, OutputType<Float>()));
4700 return reinterpret(res, OutputType<T>());
4708template <
typename T>
4709static SIMD_INLINE Vec<T, 32> bit_and(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4712 return _mm256_and_si256(a, b);
4715 return _mm256_castps_si256(
4716 _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4721static SIMD_INLINE Vec<Float, 32> bit_and(
const Vec<Float, 32> &a,
4722 const Vec<Float, 32> &b)
4724 return _mm256_and_ps(a, b);
4728static SIMD_INLINE Vec<Double, 32> bit_and(
const Vec<Double, 32> &a,
4729 const Vec<Double, 32> &b)
4731 return _mm256_and_pd(a, b);
4739template <
typename T>
4740static SIMD_INLINE Vec<T, 32> bit_or(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4743 return _mm256_or_si256(a, b);
4746 return _mm256_castps_si256(
4747 _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4752static SIMD_INLINE Vec<Float, 32> bit_or(
const Vec<Float, 32> &a,
4753 const Vec<Float, 32> &b)
4755 return _mm256_or_ps(a, b);
4759static SIMD_INLINE Vec<Double, 32> bit_or(
const Vec<Double, 32> &a,
4760 const Vec<Double, 32> &b)
4762 return _mm256_or_pd(a, b);
4770template <
typename T>
4771static SIMD_INLINE Vec<T, 32> bit_andnot(
const Vec<T, 32> &a,
4772 const Vec<T, 32> &b)
4775 return _mm256_andnot_si256(a, b);
4778 return _mm256_castps_si256(
4779 _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4784static SIMD_INLINE Vec<Float, 32> bit_andnot(
const Vec<Float, 32> &a,
4785 const Vec<Float, 32> &b)
4787 return _mm256_andnot_ps(a, b);
4791static SIMD_INLINE Vec<Double, 32> bit_andnot(
const Vec<Double, 32> &a,
4792 const Vec<Double, 32> &b)
4794 return _mm256_andnot_pd(a, b);
4802template <
typename T>
4803static SIMD_INLINE Vec<T, 32> bit_xor(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4806 return _mm256_xor_si256(a, b);
4809 return _mm256_castps_si256(
4810 _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4815static SIMD_INLINE Vec<Float, 32> bit_xor(
const Vec<Float, 32> &a,
4816 const Vec<Float, 32> &b)
4818 return _mm256_xor_ps(a, b);
4822static SIMD_INLINE Vec<Double, 32> bit_xor(
const Vec<Double, 32> &a,
4823 const Vec<Double, 32> &b)
4825 return _mm256_xor_pd(a, b);
4833template <
typename T>
4834static SIMD_INLINE Vec<T, 32> bit_not(
const Vec<T, 32> &a)
4837 return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
4840 return _mm256_castps_si256(_mm256_xor_ps(
4841 _mm256_castsi256_ps(a), _mm256_castsi256_ps(_mm256_set1_epi32(-1))));
4846static SIMD_INLINE Vec<Float, 32> bit_not(
const Vec<Float, 32> &a)
4848 return _mm256_xor_ps(a, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
4852static SIMD_INLINE Vec<Double, 32> bit_not(
const Vec<Double, 32> &a)
4854 return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
4863static SIMD_INLINE Vec<Byte, 32> avg(
const Vec<Byte, 32> &a,
4864 const Vec<Byte, 32> &b)
4866 return _mm256_avg_epu8(a, b);
4871static SIMD_INLINE Vec<SignedByte, 32> avg(
const Vec<SignedByte, 32> &a,
4872 const Vec<SignedByte, 32> &b)
4875 const __m256i signbit = _mm256_set1_epi8(int8_t(0x80));
4876 const __m256i a1 = _mm256_xor_si256(a, signbit);
4877 const __m256i b1 = _mm256_xor_si256(b, signbit);
4878 const __m256i m1 = _mm256_avg_epu8(a1, b1);
4879 return _mm256_xor_si256(m1, signbit);
4882static SIMD_INLINE Vec<Word, 32> avg(
const Vec<Word, 32> &a,
4883 const Vec<Word, 32> &b)
4885 return _mm256_avg_epu16(a, b);
4890static SIMD_INLINE Vec<Short, 32> avg(
const Vec<Short, 32> &a,
4891 const Vec<Short, 32> &b)
4894 const __m256i signbit = _mm256_set1_epi16(int16_t(0x8000));
4895 const __m256i a1 = _mm256_xor_si256(a, signbit);
4896 const __m256i b1 = _mm256_xor_si256(b, signbit);
4897 const __m256i m1 = _mm256_avg_epu16(a1, b1);
4898 return _mm256_xor_si256(m1, signbit);
4904template <
typename T>
4905static SIMD_INLINE Vec<T, 32> avg(
const Vec<T, 32> &a,
const Vec<T, 32> &b)
4907 return Vec<T, 32>(avg(a.lo(), b.lo()), avg(a.hi(), b.hi()));
4914static SIMD_INLINE Vec<Int, 32> avg(
const Vec<Int, 32> &a,
4915 const Vec<Int, 32> &b)
4917 const auto halfA = srai<1>(a);
4918 const auto halfB = srai<1>(b);
4919 const auto sum = add(halfA, halfB);
4920 const auto lsb = bit_and(bit_or(a, b), set1(
Int(1), Integer<32>()));
4921 return add(sum, lsb);
4926static SIMD_INLINE Vec<Long, 32> avg(
const Vec<Long, 32> &a,
4927 const Vec<Long, 32> &b)
4929 const auto halfA = srai<1>(a);
4930 const auto halfB = srai<1>(b);
4931 const auto sum = add(halfA, halfB);
4932 const auto lsb = bit_and(bit_or(a, b), set1(
Long(1), Integer<32>()));
4933 return add(sum, lsb);
4937static SIMD_INLINE Vec<Float, 32> avg(
const Vec<Float, 32> &a,
4938 const Vec<Float, 32> &b)
4940 return _mm256_mul_ps(_mm256_add_ps(a, b), _mm256_set1_ps(0.5f));
4944static SIMD_INLINE Vec<Double, 32> avg(
const Vec<Double, 32> &a,
4945 const Vec<Double, 32> &b)
4947 return _mm256_mul_pd(_mm256_add_pd(a, b), _mm256_set1_pd(0.5));
4954template <
typename T>
4955static SIMD_INLINE
bool test_all_zeros(
const Vec<T, 32> &a)
4957 const auto intA = reinterpret(a, OutputType<Int>());
4958 return _mm256_testz_si256(intA, intA);
4965template <
typename T>
4966static SIMD_INLINE
bool test_all_ones(
const Vec<T, 32> &a)
4968 const auto intA = reinterpret(a, OutputType<Int>());
4969 return _mm256_testc_si256(intA, _mm256_set1_epi32(-1));
4979static SIMD_INLINE Vec<Byte, 32> reverse(
const Vec<Byte, 32> &a)
4982 const __m256i mask =
4983 _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
4984 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4987 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
4989 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
4991 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
4995static SIMD_INLINE Vec<SignedByte, 32> reverse(
const Vec<SignedByte, 32> &a)
4998 const __m256i mask =
4999 _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
5000 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
5003 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5005 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5007 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5011static SIMD_INLINE Vec<Short, 32> reverse(
const Vec<Short, 32> &a)
5014 const __m256i mask =
5015 _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17,
5016 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
5017 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5019 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5021 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5025static SIMD_INLINE Vec<Word, 32> reverse(
const Vec<Word, 32> &a)
5028 const __m256i mask =
5029 _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17,
5030 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
5031 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5033 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5035 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5039static SIMD_INLINE Vec<Int, 32> reverse(
const Vec<Int, 32> &a)
5042 const __m256i mask = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
5043 return _mm256_permutevar8x32_epi32(a, mask);
5045 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5049static SIMD_INLINE Vec<Long, 32> reverse(
const Vec<Long, 32> &a)
5052 const __m256i mask = _mm256_set_epi32(1, 0, 3, 2, 5, 4, 7, 6);
5053 return _mm256_permutevar8x32_epi32(a, mask);
5055 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5059static SIMD_INLINE Vec<Float, 32> reverse(
const Vec<Float, 32> &a)
5062 const __m256i mask = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
5063 return _mm256_permutevar8x32_ps(a, mask);
5065 return _mm256_set_m128(reverse(a.lo()), reverse(a.hi()));
5069static SIMD_INLINE Vec<Double, 32> reverse(
const Vec<Double, 32> &a)
5072 const __m256i mask = _mm256_set_epi32(1, 0, 3, 2, 5, 4, 7, 6);
5073 return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(a), mask));
5075 return _mm256_set_m128d(reverse(a.lo()), reverse(a.hi()));
5089template <
typename T,
5090 SIMD_ENABLE_IF(std::is_integral<T>::value &&
sizeof(T) == 1)>
5091static SIMD_INLINE uint64_t msb2int(
const Vec<T, 32> &a)
5095 const auto res = _mm256_movemask_epi8(a);
5098 _mm_movemask_epi8(a.lo()) | (_mm_movemask_epi8(a.hi()) << 16);
5101 return uint64_t(uint(res));
5104template <
typename T,
5105 SIMD_ENABLE_IF(std::is_integral<T>::value &&
sizeof(T) == 2),
5107static SIMD_INLINE uint64_t msb2int(
const Vec<T, 32> &a)
5115 uint64_t x = _mm256_movemask_epi8(a);
5117 uint64_t x = _mm_movemask_epi8(a.lo()) | (_mm_movemask_epi8(a.hi()) << 16);
5127 x = ((x & 0x44444444) >> 1) | (x & 0x11111111);
5130 x = ((x & 0x30303030) >> 2) | (x & 0x03030303);
5133 x = ((x & 0x0F000F00) >> 4) | (x & 0x000F000F);
5136 x = ((x & 0x00FF0000) >> 8) | (x & 0x000000FF);
5142static SIMD_INLINE uint64_t msb2int(
const Vec<Int, 32> &a)
5144 return _mm256_movemask_ps(_mm256_castsi256_ps(a));
5147static SIMD_INLINE uint64_t msb2int(
const Vec<Long, 32> &a)
5149 return _mm256_movemask_pd(_mm256_castsi256_pd(a));
5152static SIMD_INLINE uint64_t msb2int(
const Vec<Float, 32> &a)
5154 return _mm256_movemask_ps(a);
5157static SIMD_INLINE uint64_t msb2int(
const Vec<Double, 32> &a)
5159 return _mm256_movemask_pd(a);
5168static SIMD_INLINE Vec<Byte, 32> int2msb(
const uint64_t a, OutputType<Byte>,
5172 const __m256i shuffleIndeces = _mm256_set_epi64x(
5173 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
5174 const __m256i aVec =
5175 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
5176 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
5177 const __m256i selected = _mm256_and_si256(aVec, sel);
5178 const __m256i result = _mm256_cmpeq_epi8(selected, sel);
5179 return _mm256_and_si256(result, _mm256_set1_epi8((int8_t) 0x80));
5181 const __m128i shuffleIndeces = _mm_set_epi64x(0x0101010101010101, 0);
5182 const __m128i aVecLo = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndeces);
5183 const __m128i aVecHi =
5184 _mm_shuffle_epi8(_mm_cvtsi32_si128(a >> 16), shuffleIndeces);
5185 const __m128i sel = _mm_set1_epi64x(0x8040201008040201);
5186 const __m128i selectedLo = _mm_and_si128(aVecLo, sel);
5187 const __m128i selectedHi = _mm_and_si128(aVecHi, sel);
5188 const __m128i resultLo = _mm_cmpeq_epi8(selectedLo, sel);
5189 const __m128i resultHi = _mm_cmpeq_epi8(selectedHi, sel);
5190 const __m256i result = _mm256_set_m128i(resultHi, resultLo);
5191 return _mm256_castps_si256(
5192 _mm256_and_ps(_mm256_castsi256_ps(result),
5193 _mm256_castsi256_ps(_mm256_set1_epi8((int8_t) 0x80))));
5197static SIMD_INLINE Vec<SignedByte, 32> int2msb(
const uint64_t a,
5198 OutputType<SignedByte>,
5201 return reinterpret(int2msb(a, OutputType<Byte>(), Integer<32>()),
5202 OutputType<SignedByte>());
5205static SIMD_INLINE Vec<Short, 32> int2msb(
const uint64_t a, OutputType<Short>,
5209 const __m256i aVec = _mm256_set1_epi16(a);
5210 const __m256i sel = _mm256_set_epi16(
5211 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
5212 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
5213 const __m256i selected = _mm256_and_si256(aVec, sel);
5214 const __m256i result = _mm256_cmpeq_epi16(selected, sel);
5215 return _mm256_and_si256(result, _mm256_set1_epi16((int16_t) 0x8000));
5217 const __m128i aVec = _mm_set1_epi16(a);
5218 const __m128i selLo = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008,
5219 0x0004, 0x0002, 0x0001);
5220 const __m128i selHi = _mm_set_epi16((int16_t) 0x8000, 0x4000, 0x2000, 0x1000,
5221 0x0800, 0x0400, 0x0200, 0x0100);
5222 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5223 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5224 const __m128i resultLo = _mm_cmpeq_epi16(selectedLo, selLo);
5225 const __m128i resultHi = _mm_cmpeq_epi16(selectedHi, selHi);
5226 const __m256i result = _mm256_set_m128i(resultHi, resultLo);
5227 return _mm256_castps_si256(
5228 _mm256_and_ps(_mm256_castsi256_ps(result),
5229 _mm256_castsi256_ps(_mm256_set1_epi16((int16_t) 0x8000))));
5233static SIMD_INLINE Vec<Word, 32> int2msb(
const uint64_t a, OutputType<Word>,
5236 return reinterpret(int2msb(a, OutputType<Short>(), Integer<32>()),
5237 OutputType<Word>());
5240static SIMD_INLINE Vec<Int, 32> int2msb(
const uint64_t a, OutputType<Int>,
5244 const __m256i aVec = _mm256_set1_epi32(a);
5246 _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
5247 const __m256i selected = _mm256_and_si256(aVec, sel);
5248 const __m256i result = _mm256_cmpeq_epi32(selected, sel);
5249 return _mm256_and_si256(result, _mm256_set1_epi32(0x80000000));
5251 const __m128i aVec = _mm_set1_epi32(a);
5252 const __m128i selLo = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
5253 const __m128i selHi = _mm_set_epi32(0x80, 0x40, 0x20, 0x10);
5254 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5255 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5256 const __m256i result = _mm256_set_m128i(_mm_cmpeq_epi32(selectedHi, selHi),
5257 _mm_cmpeq_epi32(selectedLo, selLo));
5258 return _mm256_castps_si256(
5259 _mm256_and_ps(_mm256_castsi256_ps(result),
5260 _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))));
5264static SIMD_INLINE Vec<Long, 32> int2msb(
const uint64_t a, OutputType<Long>,
5268 const __m256i aVec = _mm256_set1_epi64x(a);
5269 const __m256i sel = _mm256_set_epi64x(8, 4, 2, 1);
5270 const __m256i selected = _mm256_and_si256(aVec, sel);
5271 const __m256i result = _mm256_cmpeq_epi64(selected, sel);
5272 return _mm256_and_si256(result, _mm256_set1_epi64x(0x8000000000000000));
5274 const __m128i aVec = _mm_set1_epi64x(a);
5275 const __m128i selLo = _mm_set_epi64x(2, 1);
5276 const __m128i selHi = _mm_set_epi64x(8, 4);
5277 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5278 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5279 const __m256i result = _mm256_set_m128i(_mm_cmpeq_epi64(selectedHi, selHi),
5280 _mm_cmpeq_epi64(selectedLo, selLo));
5281 return _mm256_castpd_si256(
5282 _mm256_and_pd(_mm256_castsi256_pd(result),
5283 _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))));
5287static SIMD_INLINE Vec<Float, 32> int2msb(
const uint64_t a, OutputType<Float>,
5290 return reinterpret(int2msb(a, OutputType<Int>(), Integer<32>()),
5291 OutputType<Float>());
5294static SIMD_INLINE Vec<Double, 32> int2msb(
const uint64_t a, OutputType<Double>,
5297 return reinterpret(int2msb(a, OutputType<Long>(), Integer<32>()),
5298 OutputType<Double>());
5307static SIMD_INLINE Vec<Byte, 32> int2bits(
const uint64_t a, OutputType<Byte>,
5311 const __m256i shuffleIndeces = _mm256_set_epi64x(
5312 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
5313 const __m256i aVec =
5314 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
5315 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
5316 const __m256i selected = _mm256_and_si256(aVec, sel);
5317 return _mm256_cmpeq_epi8(selected, sel);
5319 return _mm256_set_m128i(int2bits(a >> 16, OutputType<Byte>(), Integer<16>()),
5320 int2bits(a, OutputType<Byte>(), Integer<16>()));
5324static SIMD_INLINE Vec<SignedByte, 32> int2bits(
const uint64_t a,
5325 OutputType<SignedByte>,
5328 return reinterpret(int2bits(a, OutputType<Byte>(), Integer<32>()),
5329 OutputType<SignedByte>());
5332static SIMD_INLINE Vec<Short, 32> int2bits(
const uint64_t a, OutputType<Short>,
5336 const __m256i aVec = _mm256_set1_epi16(a);
5337 const __m256i sel = _mm256_set_epi16(
5338 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
5339 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
5340 const __m256i selected = _mm256_and_si256(aVec, sel);
5341 return _mm256_cmpeq_epi16(selected, sel);
5343 return _mm256_set_m128i(int2bits(a >> 8, OutputType<Short>(), Integer<16>()),
5344 int2bits(a, OutputType<Short>(), Integer<16>()));
5348static SIMD_INLINE Vec<Word, 32> int2bits(
const uint64_t a, OutputType<Word>,
5351 return reinterpret(int2bits(a, OutputType<Short>(), Integer<32>()),
5352 OutputType<Word>());
5355static SIMD_INLINE Vec<Int, 32> int2bits(
const uint64_t a, OutputType<Int>,
5359 const __m256i aVec = _mm256_set1_epi32(a);
5361 _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
5362 const __m256i selected = _mm256_and_si256(aVec, sel);
5363 return _mm256_cmpeq_epi32(selected, sel);
5365 return _mm256_set_m128i(int2bits(a >> 4, OutputType<Int>(), Integer<16>()),
5366 int2bits(a, OutputType<Int>(), Integer<16>()));
5370static SIMD_INLINE Vec<Long, 32> int2bits(
const uint64_t a, OutputType<Long>,
5374 const __m256i aVec = _mm256_set1_epi64x(a);
5375 const __m256i sel = _mm256_set_epi64x(8, 4, 2, 1);
5376 const __m256i selected = _mm256_and_si256(aVec, sel);
5377 return _mm256_cmpeq_epi64(selected, sel);
5379 const __m128i aVec = _mm_set1_epi64x(a);
5380 const __m128i selLo = _mm_set_epi64x(2, 1);
5381 const __m128i selHi = _mm_set_epi64x(8, 4);
5382 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5383 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5384 return _mm256_set_m128i(_mm_cmpeq_epi64(selectedHi, selHi),
5385 _mm_cmpeq_epi64(selectedLo, selLo));
5389static SIMD_INLINE Vec<Float, 32> int2bits(
const uint64_t a, OutputType<Float>,
5392 return reinterpret(int2bits(a, OutputType<Int>(), Integer<32>()),
5393 OutputType<Float>());
5396static SIMD_INLINE Vec<Double, 32> int2bits(
const uint64_t a,
5397 OutputType<Double>, Integer<32>)
5399 return reinterpret(int2bits(a, OutputType<Long>(), Integer<32>()),
5400 OutputType<Double>());
5409static SIMD_INLINE Vec<Byte, 32> iota(OutputType<Byte>, Integer<32>)
5411 return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
5412 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
5416static SIMD_INLINE Vec<SignedByte, 32> iota(OutputType<SignedByte>, Integer<32>)
5418 return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
5419 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
5423static SIMD_INLINE Vec<Short, 32> iota(OutputType<Short>, Integer<32>)
5425 return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5428static SIMD_INLINE Vec<Word, 32> iota(OutputType<Word>, Integer<32>)
5430 return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5433static SIMD_INLINE Vec<Int, 32> iota(OutputType<Int>, Integer<32>)
5435 return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
5438static SIMD_INLINE Vec<Long, 32> iota(OutputType<Long>, Integer<32>)
5440 return _mm256_set_epi64x(3, 2, 1, 0);
5443static SIMD_INLINE Vec<Float, 32> iota(OutputType<Float>, Integer<32>)
5445 return _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
5448static SIMD_INLINE Vec<Double, 32> iota(OutputType<Double>, Integer<32>)
5450 return _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
aligned_allocator< Vec< T, SIMD_WIDTH >, SIMD_WIDTH > allocator
Allocator to be used with std::vector.
Definition vec.H:103
static constexpr size_t elems
Number of elements in the vector. Alias for elements.
Definition vec.H:85
static constexpr size_t bytes
Number of bytes in the vector.
Definition vec.H:90
static constexpr size_t elements
Number of elements in the vector.
Definition vec.H:80
void * aligned_malloc(size_t alignment, size_t size)
Aligned memory allocation.
Definition alloc.H:61
void aligned_free(void *ptr)
Aligned memory deallocation.
Definition alloc.H:102
float Float
Single-precision floating point number (32-bit)
Definition types.H:56
int16_t Short
Signed 16-bit integer.
Definition types.H:53
int32_t Int
Signed 32-bit integer.
Definition types.H:54
uint16_t Word
Unsigned 16-bit integer.
Definition types.H:52
int64_t Long
Signed 64-bit integer.
Definition types.H:55
uint8_t Byte
Unsigned 8-bit integer.
Definition types.H:50
double Double
Double-precision floating point number (64-bit)
Definition types.H:57
int8_t SignedByte
Signed 8-bit integer.
Definition types.H:51
Namespace for T-SIMD.
Definition time_measurement.H:161