#ifndef SIMD_VEC_BASE_IMPL_INTEL_16_H_
#define SIMD_VEC_BASE_IMPL_INTEL_16_H_

#include "SSSE3_compat.H"
#include "intrins_intel.H"

#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_16_AVAIL_) && \
  !defined(SIMDVEC_SANDBOX)
// members of the Vec<T, 16> specialization for integral element types
// (all integral element types share the __m128i register)
__m128i xmm = _mm_setzero_si128();

static constexpr size_t elements = 16 / sizeof(T);
static constexpr size_t bytes    = 16;

Vec(const __m128i &x) { xmm = x; }
Vec &operator=(const __m128i &x)
{
  xmm = x;
  return *this;
}
operator __m128i() const { return xmm; }
// members of the Vec<Float, 16> specialization
__m128 xmm = _mm_setzero_ps();

static constexpr size_t bytes = 16;

Vec(const __m128 &x) { xmm = x; }
Vec &operator=(const __m128 &x)
{
  xmm = x;
  return *this;
}
operator __m128() const { return xmm; }
// members of the Vec<Double, 16> specialization
__m128d xmm = _mm_setzero_pd();

static constexpr size_t bytes = 16;

Vec(const __m128d &x) { xmm = x; }
Vec &operator=(const __m128d &x)
{
  xmm = x;
  return *this;
}
operator __m128d() const { return xmm; }
// reinterpret between integral element types (a no-op on the register level)
template <typename Tdst, typename Tsrc,
          SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
                          std::is_integral<Tdst>::value &&
                          std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Tsrc, 16> &vec,
                                             OutputType<Tdst>)
{
  return Vec<Tdst, 16>(__m128i(vec));
}
template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Float, 16> &vec,
                                             OutputType<Tdst>)
{
  return _mm_castps_si128(vec);
}

template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Float, 16> reinterpret(const Vec<Tsrc, 16> &vec,
                                              OutputType<Float>)
{
  return _mm_castsi128_ps(vec);
}

template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Double, 16> &vec,
                                             OutputType<Tdst>)
{
  return _mm_castpd_si128(vec);
}

template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
static SIMD_INLINE Vec<Double, 16> reinterpret(const Vec<Tsrc, 16> &vec,
                                               OutputType<Double>)
{
  return _mm_castsi128_pd(vec);
}

static SIMD_INLINE Vec<Double, 16> reinterpret(const Vec<Float, 16> &vec,
                                               OutputType<Double>)
{
  return _mm_castps_pd(vec);
}

static SIMD_INLINE Vec<Float, 16> reinterpret(const Vec<Double, 16> &vec,
                                              OutputType<Float>)
{
  return _mm_castpd_ps(vec);
}

// reinterpret to the same type is the identity
template <typename T>
static SIMD_INLINE Vec<T, 16> reinterpret(const Vec<T, 16> &vec, OutputType<T>)
{
  return vec;
}
// conversion with saturation and rounding: Float -> Int
static SIMD_INLINE Vec<Int, 16> cvts(const Vec<Float, 16> &a, OutputType<Int>)
{
  // clip the input: _mm_cvtps_epi32 yields the "integer indefinite" value
  // 0x80000000 for inputs beyond the Int range
  const __m128 clip = _mm_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
  return _mm_cvtps_epi32(_mm_min_ps(clip, a));
}

static SIMD_INLINE Vec<Float, 16> cvts(const Vec<Int, 16> &a, OutputType<Float>)
{
  return _mm_cvtepi32_ps(a);
}
static SIMD_INLINE Vec<Long, 16> cvts(const Vec<Double, 16> &a,
                                      OutputType<Long>)
{
  const auto clip = _mm_set1_pd(MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64);
  Double tmpD[2] SIMD_ATTR_ALIGNED(16);
  _mm_store_pd(tmpD, _mm_min_pd(clip, a));
  Long tmpL[2] SIMD_ATTR_ALIGNED(16);
  tmpL[0] = Long(std::rint(tmpD[0]));
  tmpL[1] = Long(std::rint(tmpD[1]));
  return _mm_load_si128((__m128i *) tmpL);
}
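// Note: there is no packed Double -> 64-bit-integer conversion before
// AVX-512 (_mm_cvtpd_epi64), hence the scalar round trip through memory
// above. std::rint() rounds to nearest even, which matches the packed
// float conversions in the default rounding mode.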
static SIMD_INLINE Vec<Double, 16> cvts(const Vec<Long, 16> &a,
                                        OutputType<Double>)
{
  __m128i xH = _mm_srai_epi32(a, 16);
  xH = _mm_and_si128(xH, _mm_set1_epi64x(0xffffffff00000000));
  xH = _mm_add_epi64(
    xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.)));  // 3*2^67
#ifdef __SSE4_1__
  __m128i xL = _mm_blend_epi16(
    a, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88);  // 2^52
#else
  __m128i xL =
    _mm_or_si128(_mm_and_si128(a, _mm_set1_epi64x(0x0000ffffffffffff)),
                 _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));  // 2^52
#endif
  __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH),
                         _mm_set1_pd(442726361368656609280.));  // 3*2^67 + 2^52
  return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
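// The Long -> Double conversion above is the classic magic-number trick:
// the high bits of each Long are planted in the mantissa of a double whose
// exponent encodes 3*2^67, the low 48 bits in one encoding 2^52; after
// subtracting the combined bias (3*2^67 + 2^52), the two halves add up to
// the exact signed value (the intermediate sums stay within the 53-bit
// double mantissa).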
template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 16> setzero(OutputType<T>, Integer<16>)
{
  return _mm_setzero_si128();
}

static SIMD_INLINE Vec<Float, 16> setzero(OutputType<Float>, Integer<16>)
{
  return _mm_setzero_ps();
}

static SIMD_INLINE Vec<Double, 16> setzero(OutputType<Double>, Integer<16>)
{
  return _mm_setzero_pd();
}
static SIMD_INLINE Vec<Byte, 16> set1(Byte a, Integer<16>)
{
  return _mm_set1_epi8(a);
}

static SIMD_INLINE Vec<SignedByte, 16> set1(SignedByte a, Integer<16>)
{
  return _mm_set1_epi8(a);
}

static SIMD_INLINE Vec<Word, 16> set1(Word a, Integer<16>)
{
  return _mm_set1_epi16(a);
}

static SIMD_INLINE Vec<Short, 16> set1(Short a, Integer<16>)
{
  return _mm_set1_epi16(a);
}

static SIMD_INLINE Vec<Int, 16> set1(Int a, Integer<16>)
{
  return _mm_set1_epi32(a);
}

static SIMD_INLINE Vec<Long, 16> set1(Long a, Integer<16>)
{
  return _mm_set1_epi64x(a);
}

static SIMD_INLINE Vec<Float, 16> set1(Float a, Integer<16>)
{
  return _mm_set1_ps(a);
}

static SIMD_INLINE Vec<Double, 16> set1(Double a, Integer<16>)
{
  return _mm_set1_pd(a);
}
template <typename T>
static SIMD_INLINE Vec<T, 16> load(const T *const p, Integer<16>)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  return _mm_load_si128((__m128i *) p);
}

static SIMD_INLINE Vec<Float, 16> load(const Float *const p, Integer<16>)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  return _mm_load_ps(p);
}

static SIMD_INLINE Vec<Double, 16> load(const Double *const p, Integer<16>)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  return _mm_load_pd(p);
}
template <typename T>
static SIMD_INLINE Vec<T, 16> loadu(const T *const p, Integer<16>)
{
  return _mm_loadu_si128((__m128i *) p);
}

static SIMD_INLINE Vec<Float, 16> loadu(const Float *const p, Integer<16>)
{
  return _mm_loadu_ps(p);
}

static SIMD_INLINE Vec<Double, 16> loadu(const Double *const p, Integer<16>)
{
  return _mm_loadu_pd(p);
}
template <typename T>
static SIMD_INLINE void store(T *const p, const Vec<T, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_store_si128((__m128i *) p, a);
}

static SIMD_INLINE void store(Float *const p, const Vec<Float, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_store_ps(p, a);
}

static SIMD_INLINE void store(Double *const p, const Vec<Double, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_store_pd(p, a);
}

template <typename T>
static SIMD_INLINE void storeu(T *const p, const Vec<T, 16> &a)
{
  _mm_storeu_si128((__m128i *) p, a);
}

static SIMD_INLINE void storeu(Float *const p, const Vec<Float, 16> &a)
{
  _mm_storeu_ps(p, a);
}

static SIMD_INLINE void storeu(Double *const p, const Vec<Double, 16> &a)
{
  _mm_storeu_pd(p, a);
}

template <typename T>
static SIMD_INLINE void stream_store(T *const p, const Vec<T, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_stream_si128((__m128i *) p, a);
}

static SIMD_INLINE void stream_store(Float *const p, const Vec<Float, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_stream_ps(p, a);
}

static SIMD_INLINE void stream_store(Double *const p, const Vec<Double, 16> &a)
{
  SIMD_CHECK_ALIGNMENT(p, 16);
  _mm_stream_pd(p, a);
}
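// stream_store uses non-temporal hints: the data bypasses the cache
// hierarchy, which pays off for large write-only buffers. Non-temporal
// stores are weakly ordered; issue an sfence() before the written memory
// may be read by other threads.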
static SIMD_INLINE void lfence()
{
  _mm_lfence();
}

static SIMD_INLINE void sfence()
{
  _mm_sfence();
}

static SIMD_INLINE void mfence()
{
  _mm_mfence();
}
template <size_t INDEX>
static SIMD_INLINE Byte extract(const Vec<Byte, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si32(a);
  }
#ifdef __SSE4_1__
  else SIMD_IF_CONSTEXPR (INDEX < 16) {
    return _mm_extract_epi8(a, INDEX);
  }
#else
  // SSE2 only has 16-bit extraction: pick the right half of a 16-bit lane
  else SIMD_IF_CONSTEXPR ((INDEX & 0x1) == 0) {
    return _mm_extract_epi16(a, INDEX / 2) & 0xff;
  } else {
    return _mm_extract_epi16(_mm_srli_epi16(a, 8), INDEX / 2);
  }
#endif
}

template <size_t INDEX>
static SIMD_INLINE SignedByte extract(const Vec<SignedByte, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si32(a);
  }
#ifdef __SSE4_1__
  else SIMD_IF_CONSTEXPR (INDEX < 16) {
    return _mm_extract_epi8(a, INDEX);
  }
#else
  else SIMD_IF_CONSTEXPR ((INDEX & 0x1) == 0) {
    return _mm_extract_epi16(a, INDEX / 2) & 0xff;
  } else {
    return _mm_extract_epi16(_mm_srli_epi16(a, 8), INDEX / 2);
  }
#endif
}
template <size_t INDEX>
static SIMD_INLINE Word extract(const Vec<Word, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si32(a);
  } else SIMD_IF_CONSTEXPR (INDEX < 8) {
    return _mm_extract_epi16(a, INDEX);
  }
}

template <size_t INDEX>
static SIMD_INLINE Short extract(const Vec<Short, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si32(a);
  } else SIMD_IF_CONSTEXPR (INDEX < 8) {
    return _mm_extract_epi16(a, INDEX);
  }
}
template <size_t INDEX>
static SIMD_INLINE Int extract(const Vec<Int, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si32(a);
  }
#ifdef __SSE4_1__
  else SIMD_IF_CONSTEXPR (INDEX < 4) {
    return _mm_extract_epi32(a, INDEX);
  }
#else
  else {
    return _mm_cvtsi128_si32(_mm_srli_si128(a, INDEX * 4));
  }
#endif
}

template <size_t INDEX>
static SIMD_INLINE Long extract(const Vec<Long, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return _mm_cvtsi128_si64(a);
  } else SIMD_IF_CONSTEXPR (INDEX == 1) {
    return _mm_cvtsi128_si64(_mm_srli_si128(a, 8));
  }
}
template <size_t INDEX>
static SIMD_INLINE Float extract(const Vec<Float, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return ::simd::internal::bit_cast<Float>(
      _mm_cvtsi128_si32(_mm_castps_si128(a)));
  } else SIMD_IF_CONSTEXPR (INDEX < 4) {
#ifdef __SSE4_1__
    const int intRes = _mm_extract_ps(a, INDEX);
#else
    const int intRes =
      _mm_cvtsi128_si32(_mm_srli_si128(_mm_castps_si128(a), INDEX * 4));
#endif
    return ::simd::internal::bit_cast<Float>(intRes);
  }
}

template <size_t INDEX>
static SIMD_INLINE Double extract(const Vec<Double, 16> &a)
{
  SIMD_IF_CONSTEXPR (INDEX == 0) {
    return ::simd::internal::bit_cast<Double>(
      _mm_cvtsi128_si64(_mm_castpd_si128(a)));
  } else SIMD_IF_CONSTEXPR (INDEX == 1) {
    return ::simd::internal::bit_cast<Double>(
      _mm_cvtsi128_si64(_mm_srli_si128(_mm_castpd_si128(a), 8)));
  }
}
template <typename T>
static SIMD_INLINE Vec<T, 16> ifelse(const Vec<T, 16> &cond,
                                     const Vec<T, 16> &trueVal,
                                     const Vec<T, 16> &falseVal)
{
#ifdef __SSE4_1__
  return _mm_blendv_epi8(falseVal, trueVal, cond);
#else
  return _mm_or_si128(_mm_and_si128(cond, trueVal),
                      _mm_andnot_si128(cond, falseVal));
#endif
}

static SIMD_INLINE Vec<Float, 16> ifelse(const Vec<Float, 16> &cond,
                                         const Vec<Float, 16> &trueVal,
                                         const Vec<Float, 16> &falseVal)
{
#ifdef __SSE4_1__
  return _mm_blendv_ps(falseVal, trueVal, cond);
#else
  return _mm_or_ps(_mm_and_ps(cond, trueVal), _mm_andnot_ps(cond, falseVal));
#endif
}

static SIMD_INLINE Vec<Double, 16> ifelse(const Vec<Double, 16> &cond,
                                          const Vec<Double, 16> &trueVal,
                                          const Vec<Double, 16> &falseVal)
{
#ifdef __SSE4_1__
  return _mm_blendv_pd(falseVal, trueVal, cond);
#else
  return _mm_or_pd(_mm_and_pd(cond, trueVal), _mm_andnot_pd(cond, falseVal));
#endif
}
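// ifelse() expects masks in which each element is either all-ones or
// all-zeros (as produced by the comparison intrinsics). _mm_blendv_epi8
// decides per byte on the mask's most significant bit, so masks with mixed
// bits would behave differently in the SSE4.1 and the and/or fallback paths.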
static SIMD_INLINE Vec<Byte, 16> add(const Vec<Byte, 16> &a,
                                     const Vec<Byte, 16> &b)
{
  return _mm_add_epi8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> add(const Vec<SignedByte, 16> &a,
                                           const Vec<SignedByte, 16> &b)
{
  return _mm_add_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> add(const Vec<Word, 16> &a,
                                     const Vec<Word, 16> &b)
{
  return _mm_add_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> add(const Vec<Short, 16> &a,
                                      const Vec<Short, 16> &b)
{
  return _mm_add_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> add(const Vec<Int, 16> &a,
                                    const Vec<Int, 16> &b)
{
  return _mm_add_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 16> add(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  return _mm_add_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 16> add(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_add_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> add(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_add_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> adds(const Vec<Byte, 16> &a,
                                      const Vec<Byte, 16> &b)
{
  return _mm_adds_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> adds(const Vec<SignedByte, 16> &a,
                                            const Vec<SignedByte, 16> &b)
{
  return _mm_adds_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> adds(const Vec<Word, 16> &a,
                                      const Vec<Word, 16> &b)
{
  return _mm_adds_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 16> adds(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b)
{
  return _mm_adds_epi16(a, b);
}
static SIMD_INLINE Vec<Int, 16> adds(const Vec<Int, 16> &a,
                                     const Vec<Int, 16> &b)
{
  // no _mm_adds_epi32: detect overflow via the sign bits
  const __m128i sum = _mm_add_epi32(a, b);
  const __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
  const __m128i sumHasDiffSign = _mm_xor_si128(a, sum);
  // overflow iff the operands share a sign and the sum's sign differs
  const __m128i overflow =
    _mm_srai_epi32(_mm_andnot_si128(opsHaveDiffSign, sumHasDiffSign), 31);
  // saturated value: INT_MAX for positive a, INT_MIN for negative a
  const __m128i saturatedSum =
    _mm_xor_si128(_mm_srai_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
  return ifelse(Vec<Int, 16>(overflow), Vec<Int, 16>(saturatedSum),
                Vec<Int, 16>(sum));
}
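// Scalar equivalent of the overflow test above, for reference:
//   int32_t sum = a + b;                          // wrapping add
//   bool overflow = ((a ^ b) >= 0) && ((a ^ sum) < 0);
//   if (overflow) sum = (a < 0) ? INT32_MIN : INT32_MAX;
// i.e. overflow can only occur when both operands have the same sign, and
// the saturated value then depends on that sign alone.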
static SIMD_INLINE Vec<Long, 16> adds(const Vec<Long, 16> &a,
                                      const Vec<Long, 16> &b)
{
  // same sign-bit scheme as for Int, but the 64-bit sign has to be spread
  // from the upper 32-bit words via shift and shuffle
  __m128i sum = _mm_add_epi64(a, b);
  __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
  __m128i sumHasDiffSign = _mm_xor_si128(a, sum);
  __m128i overflow32 =
    _mm_srai_epi32(_mm_andnot_si128(opsHaveDiffSign, sumHasDiffSign), 31);
  __m128i overflow = _mm_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
  __m128i signMaskA32 = _mm_srai_epi32(a, 31);
  __m128i signMaskA = _mm_shuffle_epi32(signMaskA32, _MM_SHUFFLE(3, 3, 1, 1));
  __m128i saturatedSum =
    _mm_xor_si128(signMaskA, _mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
  return ifelse(Vec<Long, 16>(overflow), Vec<Long, 16>(saturatedSum),
                Vec<Long, 16>(sum));
}
// Float/Double adds: no saturation needed, overflow yields +/-inf
static SIMD_INLINE Vec<Float, 16> adds(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b)
{
  return _mm_add_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> adds(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return _mm_add_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> sub(const Vec<Byte, 16> &a,
                                     const Vec<Byte, 16> &b)
{
  return _mm_sub_epi8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> sub(const Vec<SignedByte, 16> &a,
                                           const Vec<SignedByte, 16> &b)
{
  return _mm_sub_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> sub(const Vec<Word, 16> &a,
                                     const Vec<Word, 16> &b)
{
  return _mm_sub_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> sub(const Vec<Short, 16> &a,
                                      const Vec<Short, 16> &b)
{
  return _mm_sub_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> sub(const Vec<Int, 16> &a,
                                    const Vec<Int, 16> &b)
{
  return _mm_sub_epi32(a, b);
}

static SIMD_INLINE Vec<Long, 16> sub(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  return _mm_sub_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 16> sub(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_sub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> sub(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_sub_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> subs(const Vec<Byte, 16> &a,
                                      const Vec<Byte, 16> &b)
{
  return _mm_subs_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> subs(const Vec<SignedByte, 16> &a,
                                            const Vec<SignedByte, 16> &b)
{
  return _mm_subs_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> subs(const Vec<Word, 16> &a,
                                      const Vec<Word, 16> &b)
{
  return _mm_subs_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 16> subs(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b)
{
  return _mm_subs_epi16(a, b);
}
static SIMD_INLINE Vec<Int, 16> subs(const Vec<Int, 16> &a,
                                     const Vec<Int, 16> &b)
{
  // no _mm_subs_epi32: overflow iff the operands' signs differ and the
  // difference's sign differs from a's
  const __m128i diff = _mm_sub_epi32(a, b);
  const __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
  const __m128i diffHasDiffSign = _mm_xor_si128(a, diff);
  const __m128i overflow =
    _mm_srai_epi32(_mm_and_si128(opsHaveDiffSign, diffHasDiffSign), 31);
  const __m128i saturatedDiff =
    _mm_xor_si128(_mm_srai_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
  return ifelse(Vec<Int, 16>(overflow), Vec<Int, 16>(saturatedDiff),
                Vec<Int, 16>(diff));
}
static SIMD_INLINE Vec<Long, 16> subs(const Vec<Long, 16> &a,
                                      const Vec<Long, 16> &b)
{
  __m128i diff = _mm_sub_epi64(a, b);
  __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
  __m128i diffHasDiffSign = _mm_xor_si128(a, diff);
  // 32-bit shift counts > 31 saturate, so these extract the sign words
  __m128i overflow32 =
    _mm_srai_epi32(_mm_and_si128(opsHaveDiffSign, diffHasDiffSign), 63);
  __m128i overflow = _mm_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
  __m128i signMaskA32 = _mm_srai_epi32(a, 63);
  __m128i signMaskA = _mm_shuffle_epi32(signMaskA32, _MM_SHUFFLE(3, 3, 1, 1));
  __m128i saturatedDiff =
    _mm_xor_si128(signMaskA, _mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
  return ifelse(Vec<Long, 16>(overflow), Vec<Long, 16>(saturatedDiff),
                Vec<Long, 16>(diff));
}
// Float/Double subs: no saturation needed, overflow yields +/-inf
static SIMD_INLINE Vec<Float, 16> subs(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b)
{
  return _mm_sub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> subs(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return _mm_sub_pd(a, b);
}
static SIMD_INLINE Vec<SignedByte, 16> neg(const Vec<SignedByte, 16> &a)
{
  return _mm_sub_epi8(_mm_setzero_si128(), a);
}

static SIMD_INLINE Vec<Short, 16> neg(const Vec<Short, 16> &a)
{
  return _mm_sub_epi16(_mm_setzero_si128(), a);
}

static SIMD_INLINE Vec<Int, 16> neg(const Vec<Int, 16> &a)
{
  return _mm_sub_epi32(_mm_setzero_si128(), a);
}

static SIMD_INLINE Vec<Long, 16> neg(const Vec<Long, 16> &a)
{
  return _mm_sub_epi64(_mm_setzero_si128(), a);
}

static SIMD_INLINE Vec<Float, 16> neg(const Vec<Float, 16> &a)
{
  return _mm_sub_ps(_mm_setzero_ps(), a);
}

// flip the sign bit instead of subtracting; also negates zeros and NaNs
static SIMD_INLINE Vec<Double, 16> neg(const Vec<Double, 16> &a)
{
  return _mm_xor_pd(a, _mm_set1_pd(-0.0));
}
static SIMD_INLINE Vec<Byte, 16> min(const Vec<Byte, 16> &a,
                                     const Vec<Byte, 16> &b)
{
  return _mm_min_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> min(const Vec<SignedByte, 16> &a,
                                           const Vec<SignedByte, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_min_epi8(a, b);
#else
  // flip the sign bits, take the unsigned min, flip back
  const __m128i signbit = _mm_set1_epi32(0x80808080);
  const __m128i a1 = _mm_xor_si128(a, signbit);
  const __m128i b1 = _mm_xor_si128(b, signbit);
  const __m128i m1 = _mm_min_epu8(a1, b1);
  return _mm_xor_si128(m1, signbit);
#endif
}

static SIMD_INLINE Vec<Word, 16> min(const Vec<Word, 16> &a,
                                     const Vec<Word, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_min_epu16(a, b);
#else
  // flip the sign bits, take the signed min, flip back
  const __m128i signbit = _mm_set1_epi32(0x80008000);
  const __m128i a1 = _mm_xor_si128(a, signbit);
  const __m128i b1 = _mm_xor_si128(b, signbit);
  const __m128i m1 = _mm_min_epi16(a1, b1);
  return _mm_xor_si128(m1, signbit);
#endif
}

static SIMD_INLINE Vec<Short, 16> min(const Vec<Short, 16> &a,
                                      const Vec<Short, 16> &b)
{
  return _mm_min_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> min(const Vec<Int, 16> &a,
                                    const Vec<Int, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_min_epi32(a, b);
#else
  const __m128i gt = _mm_cmpgt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(gt, b), _mm_andnot_si128(gt, a));
#endif
}
static SIMD_INLINE Vec<Long, 16> min(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  const __m128i gt = _mm_cmpgt_epi64(a, b);
#else
  // 64-bit signed "a > b" without _mm_cmpgt_epi64 (SSE4.2): the sign of
  // b - a, corrected for overflow of the subtraction
  const __m128i diff = _mm_sub_epi64(b, a);
  const __m128i res = _mm_xor_si128(
    diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
  // equivalent alternative:
  //   _mm_or_si128(_mm_andnot_si128(a, b),
  //                _mm_andnot_si128(_mm_xor_si128(b, a), diff))
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  const __m128i gt = _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
#ifdef __SSE4_1__
  return _mm_blendv_epi8(a, b, gt);
#else
  return _mm_or_si128(_mm_and_si128(gt, b), _mm_andnot_si128(gt, a));
#endif
}
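// The fallback compare uses the standard branch-free identity
//   (b < a)  <=>  sign( (b-a) ^ ((b^a) & ((b-a)^b)) )
// which corrects the sign of b-a in the cases where the subtraction
// overflows; the sign is then spread to all 64 bits of the lane by the
// 32-bit arithmetic shift plus shuffle.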
static SIMD_INLINE Vec<Float, 16> min(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_min_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> min(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_min_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> max(const Vec<Byte, 16> &a,
                                     const Vec<Byte, 16> &b)
{
  return _mm_max_epu8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> max(const Vec<SignedByte, 16> &a,
                                           const Vec<SignedByte, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_max_epi8(a, b);
#else
  // flip the sign bits, take the unsigned max, flip back
  const __m128i signbit = _mm_set1_epi32(0x80808080);
  const __m128i a1 = _mm_xor_si128(a, signbit);
  const __m128i b1 = _mm_xor_si128(b, signbit);
  const __m128i m1 = _mm_max_epu8(a1, b1);
  return _mm_xor_si128(m1, signbit);
#endif
}

static SIMD_INLINE Vec<Word, 16> max(const Vec<Word, 16> &a,
                                     const Vec<Word, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_max_epu16(a, b);
#else
  // flip the sign bits, take the signed max, flip back
  const __m128i signbit = _mm_set1_epi32(0x80008000);
  const __m128i a1 = _mm_xor_si128(a, signbit);
  const __m128i b1 = _mm_xor_si128(b, signbit);
  const __m128i m1 = _mm_max_epi16(a1, b1);
  return _mm_xor_si128(m1, signbit);
#endif
}

static SIMD_INLINE Vec<Short, 16> max(const Vec<Short, 16> &a,
                                      const Vec<Short, 16> &b)
{
  return _mm_max_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> max(const Vec<Int, 16> &a,
                                    const Vec<Int, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_max_epi32(a, b);
#else
  const __m128i gt = _mm_cmpgt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
#endif
}
static SIMD_INLINE Vec<Long, 16> max(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  const __m128i gt = _mm_cmpgt_epi64(a, b);
#else
  // same 64-bit "a > b" emulation as in min() above
  const __m128i diff = _mm_sub_epi64(b, a);
  const __m128i res = _mm_xor_si128(
    diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
  // equivalent alternative:
  //   _mm_or_si128(_mm_andnot_si128(a, b),
  //                _mm_andnot_si128(_mm_xor_si128(b, a), diff))
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  const __m128i gt = _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
#ifdef __SSE4_1__
  return _mm_blendv_epi8(b, a, gt);
#else
  return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
#endif
}
static SIMD_INLINE Vec<Float, 16> max(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_max_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> max(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_max_pd(a, b);
}
static SIMD_INLINE Vec<Float, 16> mul(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_mul_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> mul(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_mul_pd(a, b);
}

static SIMD_INLINE Vec<Float, 16> div(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return _mm_div_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> div(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return _mm_div_pd(a, b);
}
// rounding of integral types is the identity
template <typename T>
static SIMD_INLINE Vec<T, 16> ceil(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> floor(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> round(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> truncate(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}
static SIMD_INLINE Vec<Float, 16> ceil(const Vec<Float, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_ceil_ps(a);
#else
  // all floats with |a| >= 2^23 are already integral
  const __m128 limit = _mm_set1_ps(8388608.f);  // 2^23
  const __m128 absA =
    _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
  const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
  const __m128 isNeg =
    _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
  __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
  const __m128 isNotInt = _mm_cmpneq_ps(a, aTrunc);
  const __m128 one = _mm_set1_ps(1.0f);
  // truncation rounded positive non-integral values down: add 1
  const __m128 oneMask = _mm_and_ps(_mm_andnot_ps(isNeg, isNotInt), one);
  aTrunc = _mm_add_ps(aTrunc, oneMask);
  return ifelse(noRndReq, a, aTrunc);
#endif
}
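// The pre-SSE4.1 emulation relies on two facts: every float with
// |x| >= 2^23 is already an integer (the mantissa has 23 bits), so those
// inputs pass through unchanged, and _mm_cvttps_epi32 truncates toward
// zero, so only non-integral inputs on one side of zero need a +/-1
// correction (positive for ceil, negative for floor).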
static SIMD_INLINE Vec<Double, 16> ceil(const Vec<Double, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_ceil_pd(a);
#else
  // no packed alternative, use scalar code
  Double inArr[2] SIMD_ATTR_ALIGNED(16);
  _mm_store_pd(inArr, a);
  Double outArr[2] SIMD_ATTR_ALIGNED(16);
  outArr[0] = std::ceil(inArr[0]);
  outArr[1] = std::ceil(inArr[1]);
  return _mm_load_pd(outArr);
#endif
}
static SIMD_INLINE Vec<Float, 16> floor(const Vec<Float, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_floor_ps(a);
#else
  // all floats with |a| >= 2^23 are already integral
  const __m128 limit = _mm_set1_ps(8388608.f);  // 2^23
  const __m128 absA =
    _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
  const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
  const __m128 isNeg =
    _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
  __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
  const __m128 isNotInt = _mm_cmpneq_ps(a, aTrunc);
  const __m128 one = _mm_set1_ps(1.0f);
  // truncation rounded negative non-integral values up: subtract 1
  const __m128 oneMask = _mm_and_ps(_mm_and_ps(isNeg, isNotInt), one);
  aTrunc = _mm_sub_ps(aTrunc, oneMask);
  return ifelse(noRndReq, a, aTrunc);
#endif
}
static SIMD_INLINE Vec<Double, 16> floor(const Vec<Double, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_floor_pd(a);
#else
  // no packed alternative, use scalar code
  Double inArr[2] SIMD_ATTR_ALIGNED(16);
  _mm_store_pd(inArr, a);
  Double outArr[2] SIMD_ATTR_ALIGNED(16);
  outArr[0] = std::floor(inArr[0]);
  outArr[1] = std::floor(inArr[1]);
  return _mm_load_pd(outArr);
#endif
}
static SIMD_INLINE Vec<Float, 16> round(const Vec<Float, 16> &a)
{
#ifdef __SSE4_1__
  // rounds to even on ties, like std::rint()
  return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
  const __m128 limit = _mm_set1_ps(8388608.f);  // 2^23
  const __m128 absA =
    _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
  const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
  // _mm_cvtps_epi32 rounds to nearest, even on ties
  const __m128 aRnd = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
  return ifelse(noRndReq, a, aRnd);
#endif
}
static SIMD_INLINE Vec<Double, 16> round(const Vec<Double, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
  // no packed alternative, use scalar code; std::rint() rounds to even
  // on ties like the SSE4.1 version
  Double inArr[2] SIMD_ATTR_ALIGNED(16);
  _mm_store_pd(inArr, a);
  Double outArr[2] SIMD_ATTR_ALIGNED(16);
  outArr[0] = std::rint(inArr[0]);
  outArr[1] = std::rint(inArr[1]);
  return _mm_load_pd(outArr);
#endif
}
static SIMD_INLINE Vec<Float, 16> truncate(const Vec<Float, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
  const __m128 limit = _mm_set1_ps(8388608.f);  // 2^23
  const __m128 absA =
    _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
  const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
  const __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
  return ifelse(noRndReq, a, aTrunc);
#endif
}

static SIMD_INLINE Vec<Double, 16> truncate(const Vec<Double, 16> &a)
{
#ifdef __SSE4_1__
  return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
  // no packed alternative, use scalar code
  Double inArr[2] SIMD_ATTR_ALIGNED(16);
  _mm_store_pd(inArr, a);
  Double outArr[2] SIMD_ATTR_ALIGNED(16);
  outArr[0] = std::trunc(inArr[0]);
  outArr[1] = std::trunc(inArr[1]);
  return _mm_load_pd(outArr);
#endif
}
// fast approximate reciprocal (about 12 bits of precision)
static SIMD_INLINE Vec<Float, 16> rcp(const Vec<Float, 16> &a)
{
  return _mm_rcp_ps(a);
}

// no rcp instruction for Double, use exact division
static SIMD_INLINE Vec<Double, 16> rcp(const Vec<Double, 16> &a)
{
  return _mm_div_pd(_mm_set1_pd(1.0), a);
}

// fast approximate reciprocal square root (about 12 bits of precision)
static SIMD_INLINE Vec<Float, 16> rsqrt(const Vec<Float, 16> &a)
{
  return _mm_rsqrt_ps(a);
}

// no rsqrt instruction for Double, use exact computation
static SIMD_INLINE Vec<Double, 16> rsqrt(const Vec<Double, 16> &a)
{
  return _mm_div_pd(_mm_set1_pd(1.0), _mm_sqrt_pd(a));
}
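// If more precision is needed, one Newton-Raphson step refines the rcp()
// estimate to nearly full single precision (a common usage pattern, not a
// function provided by this header):
//   Vec<Float, 16> x = rcp(a);                    // ~12-bit estimate
//   x = mul(x, sub(set1(2.0f, Integer<16>()),     // x * (2 - a*x)
//                  mul(a, x)));
// and analogously x * (1.5 - 0.5*a*x*x) for rsqrt().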
static SIMD_INLINE Vec<Float, 16> sqrt(const Vec<Float, 16> &a)
{
  return _mm_sqrt_ps(a);
}

static SIMD_INLINE Vec<Double, 16> sqrt(const Vec<Double, 16> &a)
{
  return _mm_sqrt_pd(a);
}
// abs of unsigned integral types is the identity
template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value &&
                                     std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 16> abs(const Vec<T, 16> &a)
{
  return a;
}

static SIMD_INLINE Vec<SignedByte, 16> abs(const Vec<SignedByte, 16> &a)
{
  return _mm_abs_epi8(a);
}

static SIMD_INLINE Vec<Short, 16> abs(const Vec<Short, 16> &a)
{
  return _mm_abs_epi16(a);
}

static SIMD_INLINE Vec<Int, 16> abs(const Vec<Int, 16> &a)
{
  return _mm_abs_epi32(a);
}

static SIMD_INLINE Vec<Long, 16> abs(const Vec<Long, 16> &a)
{
  // no _mm_abs_epi64 before AVX-512: (a ^ sign) - sign
  const __m128i signMask =
    _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
  return _mm_sub_epi64(_mm_xor_si128(a, signMask), signMask);
}

static SIMD_INLINE Vec<Float, 16> abs(const Vec<Float, 16> &a)
{
  return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
}

static SIMD_INLINE Vec<Double, 16> abs(const Vec<Double, 16> &a)
{
  return _mm_and_pd(a, _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF)));
}
// unpack: interleave the lower halves of a and b (Part<0>) with the given
// element granularity in bytes
template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<0>, Bytes<1>)
{
  return _mm_unpacklo_epi8(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<0>, Bytes<2>)
{
  return _mm_unpacklo_epi16(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<0>, Bytes<4>)
{
  return _mm_unpacklo_epi32(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<0>, Bytes<8>)
{
  return _mm_unpacklo_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b, Part<0>,
                                         Bytes<4>)
{
  return _mm_unpacklo_ps(a, b);
}

static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b, Part<0>,
                                         Bytes<8>)
{
  return _mm_movelh_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b, Part<0>,
                                          Bytes<8>)
{
  return _mm_unpacklo_pd(a, b);
}
// unpack: interleave the upper halves of a and b (Part<1>)
template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<1>, Bytes<1>)
{
  return _mm_unpackhi_epi8(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<1>, Bytes<2>)
{
  return _mm_unpackhi_epi16(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<1>, Bytes<4>)
{
  return _mm_unpackhi_epi32(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                     Part<1>, Bytes<8>)
{
  return _mm_unpackhi_epi64(a, b);
}

static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b, Part<1>,
                                         Bytes<4>)
{
  return _mm_unpackhi_ps(a, b);
}

static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b, Part<1>,
                                         Bytes<8>)
{
  return _mm_movehl_ps(b, a);
}

static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b, Part<1>,
                                          Bytes<8>)
{
  return _mm_unpackhi_pd(a, b);
}
// unpack16 is identical to unpack for 16-byte vectors
template <size_t PART, size_t BYTES, typename T>
static SIMD_INLINE Vec<T, 16> unpack16(const Vec<T, 16> &a, const Vec<T, 16> &b,
                                       Part<PART>, Bytes<BYTES>)
{
  return unpack(a, b, Part<PART>(), Bytes<BYTES>());
}

// a 16-byte vector has a single lane, so extractLane is the identity
template <size_t LANE_INDEX, typename T>
static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 16> &a)
{
  return a;
}
template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip(const Vec<T, 16> a, const Vec<T, 16> b,
                            Vec<T, 16> &l, Vec<T, 16> &h)
{
  l = unpack(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
  h = unpack(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
}

// zip16 is identical to zip for 16-byte vectors
template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip16(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h)
{
  zip<NUM_ELEMS, T>(a, b, l, h);
}
// unzip: deinterleave two vectors; l collects the even-indexed, h the
// odd-indexed elements of the concatenation of a and b
template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h, Bytes<1>)
{
  const __m128i mask =
    _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
  const __m128i atmp = _mm_shuffle_epi8(a, mask);
  const __m128i btmp = _mm_shuffle_epi8(b, mask);
  l = _mm_unpacklo_epi64(atmp, btmp);
  h = _mm_unpackhi_epi64(atmp, btmp);
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h, Bytes<2>)
{
  const __m128i mask =
    _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
  const __m128i atmp = _mm_shuffle_epi8(a, mask);
  const __m128i btmp = _mm_shuffle_epi8(b, mask);
  l = _mm_unpacklo_epi64(atmp, btmp);
  h = _mm_unpackhi_epi64(atmp, btmp);
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h, Bytes<4>)
{
  const __m128 aps = _mm_castsi128_ps(a);
  const __m128 bps = _mm_castsi128_ps(b);
  l = _mm_castps_si128(_mm_shuffle_ps(aps, bps, _MM_SHUFFLE(2, 0, 2, 0)));
  h = _mm_castps_si128(_mm_shuffle_ps(aps, bps, _MM_SHUFFLE(3, 1, 3, 1)));
}

template <typename T>
static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h, Bytes<8>)
{
  l = unpack(a, b, Part<0>(), Bytes<8>());
  h = unpack(a, b, Part<1>(), Bytes<8>());
}
static SIMD_INLINE void unzip(const Vec<Float, 16> a, const Vec<Float, 16> b,
                              Vec<Float, 16> &l, Vec<Float, 16> &h, Bytes<4>)
{
  l = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
  h = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
}
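// Typical use of unzip: splitting interleaved data, e.g. stereo samples
// or two-channel pixel data (a hypothetical illustration, not part of
// this header):
//   Vec<Short, 16> ab0 = load(buf, Integer<16>());      // L R L R ...
//   Vec<Short, 16> ab1 = load(buf + 8, Integer<16>());
//   Vec<Short, 16> left, right;
//   unzip(ab0, ab1, left, right, Bytes<2>());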
static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Short, 16> &a,
                                             const Vec<Short, 16> &b,
                                             OutputType<SignedByte>)
{
  return _mm_packs_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> packs(const Vec<Int, 16> &a,
                                        const Vec<Int, 16> &b,
                                        OutputType<Short>)
{
  return _mm_packs_epi32(a, b);
}

static SIMD_INLINE Vec<Short, 16> packs(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b,
                                        OutputType<Short>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Short>());
}

static SIMD_INLINE Vec<Float, 16> packs(const Vec<Long, 16> &a,
                                        const Vec<Long, 16> &b,
                                        OutputType<Float>)
{
  return _mm_shuffle_ps(_mm_cvtpd_ps(cvts(a, OutputType<Double>())),
                        _mm_cvtpd_ps(cvts(b, OutputType<Double>())),
                        _MM_SHUFFLE(1, 0, 1, 0));
}

static SIMD_INLINE Vec<Float, 16> packs(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b,
                                        OutputType<Float>)
{
  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b),
                        _MM_SHUFFLE(1, 0, 1, 0));
}
static SIMD_INLINE Vec<Int, 16> packs(const Vec<Long, 16> &a,
                                      const Vec<Long, 16> &b, OutputType<Int>)
{
  // no saturated 64 -> 32 bit pack instruction, use scalar code
  Long input[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) input, a);
  _mm_store_si128((__m128i *) (input + 2), b);
  Int output[4] SIMD_ATTR_ALIGNED(16);
  for (int i = 0; i < 4; ++i) {
    output[i] =
      (Int) std::min(std::max(input[i], (Long) std::numeric_limits<Int>::min()),
                     (Long) std::numeric_limits<Int>::max());
  }
  return _mm_load_si128((__m128i *) output);
}

static SIMD_INLINE Vec<Int, 16> packs(const Vec<Double, 16> &a,
                                      const Vec<Double, 16> &b, OutputType<Int>)
{
  const __m128d clip = _mm_set1_pd(std::numeric_limits<Int>::max());
  const __m128 bI = _mm_castsi128_ps(_mm_cvtpd_epi32(_mm_min_pd(clip, b)));
  const __m128 aI = _mm_castsi128_ps(_mm_cvtpd_epi32(_mm_min_pd(clip, a)));
  return _mm_castps_si128(_mm_shuffle_ps(aI, bI, _MM_SHUFFLE(1, 0, 1, 0)));
}
static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b, OutputType<Byte>)
{
  // _mm_packus_epi16 interprets its input as signed, so values > 0x7fff
  // would wrap; saturate to 0xff beforehand
  return _mm_packus_epi16(min(a, Vec<Word, 16>(_mm_set1_epi16(0xff))),
                          min(b, Vec<Word, 16>(_mm_set1_epi16(0xff))));
}

static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b,
                                       OutputType<Byte>)
{
  return _mm_packus_epi16(a, b);
}
static SIMD_INLINE Vec<Word, 16> packs(const Vec<Int, 16> &a,
                                       const Vec<Int, 16> &b, OutputType<Word>)
{
#ifdef __SSE4_1__
  return _mm_packus_epi32(a, b);
#else
  const __m128i mask = _mm_set1_epi32(0x0000ffff);
  // clamp negative values to zero
  __m128i asat = _mm_andnot_si128(_mm_srai_epi32(a, 31), a);
  // saturate values > 0xffff to 0xffff
  asat = _mm_srai_epi32(
    _mm_or_si128(_mm_slli_epi32(asat, 16), _mm_cmpgt_epi32(asat, mask)), 16);
  __m128i bsat = _mm_andnot_si128(_mm_srai_epi32(b, 31), b);
  bsat = _mm_srai_epi32(
    _mm_or_si128(_mm_slli_epi32(bsat, 16), _mm_cmpgt_epi32(bsat, mask)), 16);
  return _mm_packs_epi32(asat, bsat);
#endif
}
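// The SSE2 fallback above first zeroes negative inputs (andnot with the
// sign mask). The slli/or/srai step then maps any value > 0xffff to -1 and
// any value in [0, 0xffff] to its sign-extended 16-bit pattern, so the
// signed pack _mm_packs_epi32 emits exactly the saturated unsigned words.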
static SIMD_INLINE Vec<Word, 16> packs(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b,
                                       OutputType<Word>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Word>());
}

static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Word, 16> &a,
                                             const Vec<Word, 16> &b,
                                             OutputType<SignedByte>)
{
  // saturate to the signed maximum 0x7f before the signed pack
  return _mm_packs_epi16(min(a, Vec<Word, 16>(_mm_set1_epi16(0x7f))),
                         min(b, Vec<Word, 16>(_mm_set1_epi16(0x7f))));
}
// same-type extend is a copy
template <typename T>
static SIMD_INLINE void extend(const Vec<T, 16> &vIn, Vec<T, 16> vOut[1])
{
  vOut[0] = vIn;
}

// 1:1 conversions between signed and unsigned types saturate at zero or
// at the smaller positive maximum
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Byte, 16> vOut[1])
{
  vOut[0] = max(vIn, _mm_setzero_si128());
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
                               Vec<SignedByte, 16> vOut[1])
{
  vOut[0] = min(vIn, _mm_set1_epi8(0x7f));
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Word, 16> vOut[1])
{
  vOut[0] = _mm_max_epi16(vIn, _mm_setzero_si128());
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Short, 16> vOut[1])
{
  vOut[0] = min(vIn, _mm_set1_epi16(0x7fff));
}
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Short, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi8_epi16(vIn);
  vOut[1] = _mm_cvtepi8_epi16(_mm_srli_si128(vIn, 8));
#else
  // move each byte to the upper half of its 16-bit lane (the unpacked
  // lower bytes are don't-care), then sign-extend with an arithmetic shift
  vOut[0] = _mm_srai_epi16(_mm_unpacklo_epi8(_mm_undefined_si128(), vIn), 8);
  vOut[1] = _mm_srai_epi16(_mm_unpackhi_epi8(_mm_undefined_si128(), vIn), 8);
#endif
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Int, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi16_epi32(vIn);
  vOut[1] = _mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8));
#else
  vOut[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_undefined_si128(), vIn), 16);
  vOut[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_undefined_si128(), vIn), 16);
#endif
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
                               Vec<Float, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(vIn));
  vOut[1] = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8)));
#else
  vOut[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(vIn, vIn), 16));
  vOut[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(vIn, vIn), 16));
#endif
}
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Word, 16> vOut[2])
{
  vOut[0] = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Short, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepu8_epi16(vIn);
  vOut[1] = _mm_cvtepu8_epi16(_mm_srli_si128(vIn, 8));
#else
  vOut[0] = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
#endif
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Int, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepu16_epi32(vIn);
  vOut[1] = _mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8));
#else
  vOut[0] = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
#endif
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Float, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(vIn));
  vOut[1] = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8)));
#else
  vOut[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(vIn, _mm_setzero_si128()));
  vOut[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(vIn, _mm_setzero_si128()));
#endif
}
static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Long, 16> vOut[2])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_epi64(vIn);
  vOut[1] = _mm_cvtepi32_epi64(_mm_srli_si128(vIn, 8));
#else
  const __m128i sign = _mm_srai_epi32(vIn, 31);
  vOut[0] = _mm_unpacklo_epi32(vIn, sign);
  vOut[1] = _mm_unpackhi_epi32(vIn, sign);
#endif
}

static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Double, 16> vOut[2])
{
  vOut[0] = _mm_cvtepi32_pd(vIn);
  vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(vIn, 8));
}

static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Long, 16> vOut[2])
{
  const auto clipped =
    _mm_min_ps(vIn, _mm_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64));
  vOut[0] = cvts(_mm_cvtps_pd(clipped), OutputType<Long>());
  vOut[1] = cvts(_mm_cvtps_pd(_mm_castsi128_ps(
                   _mm_srli_si128(_mm_castps_si128(clipped), 8))),
                 OutputType<Long>());
}

static SIMD_INLINE void extend(const Vec<Float, 16> &vIn,
                               Vec<Double, 16> vOut[2])
{
  vOut[0] = _mm_cvtps_pd(vIn);
  vOut[1] =
    _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(vIn), 8)));
}
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Word, 16> vOut[2])
{
#ifdef __SSE4_1__
  // clamp negative values to zero
  const __m128i vInPos = _mm_max_epi8(vIn, _mm_setzero_si128());
#else
  // emulate _mm_max_epi8(vIn, 0) via unsigned max with flipped sign bits
  const __m128i signbit = _mm_set1_epi32(0x80808080);
  const __m128i a1 = _mm_xor_si128(vIn, signbit);
  const __m128i m1 = _mm_max_epu8(a1, signbit);
  const __m128i vInPos = _mm_xor_si128(m1, signbit);
#endif
  vOut[0] = _mm_unpacklo_epi8(vInPos, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi8(vInPos, _mm_setzero_si128());
}
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Int, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi8_epi32(vIn);
  vOut[1] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4));
  vOut[2] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8));
  vOut[3] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12));
#else
  // move each byte to the top of a 32-bit lane (lower bytes don't-care),
  // then sign-extend with an arithmetic shift
  const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
  const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
  const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
  const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
  const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
  const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
  vOut[0] = _mm_srai_epi32(lolo16, 24);
  vOut[1] = _mm_srai_epi32(lohi16, 24);
  vOut[2] = _mm_srai_epi32(hilo16, 24);
  vOut[3] = _mm_srai_epi32(hihi16, 24);
#endif
}

static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Float, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(vIn));
  vOut[1] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4)));
  vOut[2] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8)));
  vOut[3] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12)));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
  const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
  const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
  const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
  const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
  const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
  vOut[0] = _mm_cvtepi32_ps(_mm_srai_epi32(lolo16, 24));
  vOut[1] = _mm_cvtepi32_ps(_mm_srai_epi32(lohi16, 24));
  vOut[2] = _mm_cvtepi32_ps(_mm_srai_epi32(hilo16, 24));
  vOut[3] = _mm_cvtepi32_ps(_mm_srai_epi32(hihi16, 24));
#endif
}
static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Long, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi16_epi64(vIn);
  vOut[1] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 4));
  vOut[2] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 8));
  vOut[3] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 12));
#else
  const __m128i lo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), vIn);
  const __m128i hi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), vIn);
  const __m128i lo16ext = _mm_srai_epi32(lo16, 16);
  const __m128i hi16ext = _mm_srai_epi32(hi16, 16);
  const __m128i lo16sign = _mm_srai_epi32(lo16, 31);
  const __m128i hi16sign = _mm_srai_epi32(hi16, 31);
  vOut[0] = _mm_unpacklo_epi32(lo16ext, lo16sign);
  vOut[1] = _mm_unpackhi_epi32(lo16ext, lo16sign);
  vOut[2] = _mm_unpacklo_epi32(hi16ext, hi16sign);
  vOut[3] = _mm_unpackhi_epi32(hi16ext, hi16sign);
#endif
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
                               Vec<Double, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(vIn));
  vOut[1] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 4)));
  vOut[2] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8)));
  vOut[3] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 12)));
#else
  const __m128i lo16 =
    _mm_srai_epi32(_mm_unpacklo_epi16(_mm_undefined_si128(), vIn), 16);
  const __m128i hi16 =
    _mm_srai_epi32(_mm_unpackhi_epi16(_mm_undefined_si128(), vIn), 16);
  vOut[0] = _mm_cvtepi32_pd(lo16);
  vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lo16, 8));
  vOut[2] = _mm_cvtepi32_pd(hi16);
  vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(hi16, 8));
#endif
}
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Int, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepu8_epi32(vIn);
  vOut[1] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4));
  vOut[2] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8));
  vOut[3] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
  vOut[0] = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
  vOut[2] = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
  vOut[3] = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
#endif
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Float, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(vIn));
  vOut[1] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4)));
  vOut[2] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8)));
  vOut[3] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12)));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
  vOut[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo8, _mm_setzero_si128()));
  vOut[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo8, _mm_setzero_si128()));
  vOut[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi8, _mm_setzero_si128()));
  vOut[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi8, _mm_setzero_si128()));
#endif
}
static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Long, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepu16_epi64(vIn);
  vOut[1] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 4));
  vOut[2] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 8));
  vOut[3] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 12));
#else
  const __m128i lo16 = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
  const __m128i hi16 = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
  vOut[0] = _mm_unpacklo_epi32(lo16, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi32(lo16, _mm_setzero_si128());
  vOut[2] = _mm_unpacklo_epi32(hi16, _mm_setzero_si128());
  vOut[3] = _mm_unpackhi_epi32(hi16, _mm_setzero_si128());
#endif
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn,
                               Vec<Double, 16> vOut[4])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(vIn));
  vOut[1] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 4)));
  vOut[2] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8)));
  vOut[3] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 12)));
#else
  const __m128i lo16 = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
  const __m128i hi16 = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
  vOut[0] = _mm_cvtepi32_pd(lo16);
  vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lo16, 8));
  vOut[2] = _mm_cvtepi32_pd(hi16);
  vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(hi16, 8));
#endif
}
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Long, 16> vOut[8])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi8_epi64(vIn);
  vOut[1] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 2));
  vOut[2] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 4));
  vOut[3] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 6));
  vOut[4] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 8));
  vOut[5] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 10));
  vOut[6] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 12));
  vOut[7] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 14));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
  const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
  const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
  const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
  const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
  const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
  const __m128i lolo16ext = _mm_srai_epi32(lolo16, 24);
  const __m128i lohi16ext = _mm_srai_epi32(lohi16, 24);
  const __m128i hilo16ext = _mm_srai_epi32(hilo16, 24);
  const __m128i hihi16ext = _mm_srai_epi32(hihi16, 24);
  const __m128i lolo16sign = _mm_srai_epi32(lolo16, 31);
  const __m128i lohi16sign = _mm_srai_epi32(lohi16, 31);
  const __m128i hilo16sign = _mm_srai_epi32(hilo16, 31);
  const __m128i hihi16sign = _mm_srai_epi32(hihi16, 31);
  vOut[0] = _mm_unpacklo_epi32(lolo16ext, lolo16sign);
  vOut[1] = _mm_unpackhi_epi32(lolo16ext, lolo16sign);
  vOut[2] = _mm_unpacklo_epi32(lohi16ext, lohi16sign);
  vOut[3] = _mm_unpackhi_epi32(lohi16ext, lohi16sign);
  vOut[4] = _mm_unpacklo_epi32(hilo16ext, hilo16sign);
  vOut[5] = _mm_unpackhi_epi32(hilo16ext, hilo16sign);
  vOut[6] = _mm_unpacklo_epi32(hihi16ext, hihi16sign);
  vOut[7] = _mm_unpackhi_epi32(hihi16ext, hihi16sign);
#endif
}
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Double, 16> vOut[8])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(vIn));
  vOut[1] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 2)));
  vOut[2] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4)));
  vOut[3] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 6)));
  vOut[4] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8)));
  vOut[5] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 10)));
  vOut[6] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12)));
  vOut[7] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 14)));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
  const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
  const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
  const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
  const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
  const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
  const __m128i lolo16ext = _mm_srai_epi32(lolo16, 24);
  const __m128i lohi16ext = _mm_srai_epi32(lohi16, 24);
  const __m128i hilo16ext = _mm_srai_epi32(hilo16, 24);
  const __m128i hihi16ext = _mm_srai_epi32(hihi16, 24);
  vOut[0] = _mm_cvtepi32_pd(lolo16ext);
  vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lolo16ext, 8));
  vOut[2] = _mm_cvtepi32_pd(lohi16ext);
  vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(lohi16ext, 8));
  vOut[4] = _mm_cvtepi32_pd(hilo16ext);
  vOut[5] = _mm_cvtepi32_pd(_mm_srli_si128(hilo16ext, 8));
  vOut[6] = _mm_cvtepi32_pd(hihi16ext);
  vOut[7] = _mm_cvtepi32_pd(_mm_srli_si128(hihi16ext, 8));
#endif
}
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Long, 16> vOut[8])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepu8_epi64(vIn);
  vOut[1] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 2));
  vOut[2] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 4));
  vOut[3] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 6));
  vOut[4] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 8));
  vOut[5] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 10));
  vOut[6] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 12));
  vOut[7] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 14));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
  const __m128i lolo16 = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
  const __m128i lohi16 = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
  const __m128i hilo16 = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
  const __m128i hihi16 = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
  vOut[0] = _mm_unpacklo_epi32(lolo16, _mm_setzero_si128());
  vOut[1] = _mm_unpackhi_epi32(lolo16, _mm_setzero_si128());
  vOut[2] = _mm_unpacklo_epi32(lohi16, _mm_setzero_si128());
  vOut[3] = _mm_unpackhi_epi32(lohi16, _mm_setzero_si128());
  vOut[4] = _mm_unpacklo_epi32(hilo16, _mm_setzero_si128());
  vOut[5] = _mm_unpackhi_epi32(hilo16, _mm_setzero_si128());
  vOut[6] = _mm_unpacklo_epi32(hihi16, _mm_setzero_si128());
  vOut[7] = _mm_unpackhi_epi32(hihi16, _mm_setzero_si128());
#endif
}
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
                               Vec<Double, 16> vOut[8])
{
#ifdef __SSE4_1__
  vOut[0] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(vIn));
  vOut[1] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 2)));
  vOut[2] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4)));
  vOut[3] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 6)));
  vOut[4] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8)));
  vOut[5] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 10)));
  vOut[6] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12)));
  vOut[7] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 14)));
#else
  const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
  const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
  const __m128i lolo16 = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
  const __m128i lohi16 = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
  const __m128i hilo16 = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
  const __m128i hihi16 = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
  vOut[0] = _mm_cvtepi32_pd(lolo16);
  vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lolo16, 8));
  vOut[2] = _mm_cvtepi32_pd(lohi16);
  vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(lohi16, 8));
  vOut[4] = _mm_cvtepi32_pd(hilo16);
  vOut[5] = _mm_cvtepi32_pd(_mm_srli_si128(hilo16, 8));
  vOut[6] = _mm_cvtepi32_pd(hihi16);
  vOut[7] = _mm_cvtepi32_pd(_mm_srli_si128(hihi16, 8));
#endif
}
// same-size int <-> float conversions go through cvts (with saturation)
template <typename Tout, typename Tin,
          SIMD_ENABLE_IF(sizeof(Tin) == sizeof(Tout) &&
                         std::is_floating_point<Tin>::value !=
                           std::is_floating_point<Tout>::value)>
static SIMD_INLINE void extend(const Vec<Tin, 16> &vIn, Vec<Tout, 16> vOut[1])
{
  vOut[0] = cvts(vIn, OutputType<Tout>());
}
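// Summary of the extend() contract: vOut must provide
// sizeof(Tout) / sizeof(Tin) output vectors; signed sources are
// sign-extended, unsigned sources zero-extended, and conversions to a
// same-size type of different signedness (or via cvts) saturate.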
template <size_t COUNT>
static SIMD_INLINE Vec<Byte, 16> srai(const Vec<Byte, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    const __m128i odd = _mm_srai_epi16(a, COUNT);
    const __m128i even = _mm_srai_epi16(_mm_slli_epi16(a, 8), COUNT + 8);
    const __m128i odd_masked =
      _mm_and_si128(odd, _mm_set1_epi16((int16_t) 0xFF00));
    const __m128i even_masked = _mm_and_si128(even, _mm_set1_epi16(0x00FF));
    return _mm_or_si128(odd_masked, even_masked);
  } else {
    // all value bits are shifted out, only the sign remains
    return _mm_cmplt_epi8(a, _mm_setzero_si128());
  }
}
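// SSE has no 8-bit shifts: the trick above performs two 16-bit arithmetic
// shifts, one for the odd bytes in place and one for the even bytes first
// moved to the upper half of their 16-bit lane, and then merges the two
// masked halves; for COUNT >= 8 every result byte is just its sign.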
template <size_t COUNT>
static SIMD_INLINE Vec<SignedByte, 16> srai(const Vec<SignedByte, 16> &a)
{
  return reinterpret(srai<COUNT>(reinterpret(a, OutputType<Byte>())),
                     OutputType<SignedByte>());
}

// counts >= the element width are clamped; an arithmetic shift by the full
// width fills with the sign bit anyway
template <size_t COUNT>
static SIMD_INLINE Vec<Word, 16> srai(const Vec<Word, 16> &a)
{
  return _mm_srai_epi16(a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 16> srai(const Vec<Short, 16> &a)
{
  return _mm_srai_epi16(a, vec::min(COUNT, 15ul));
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 16> srai(const Vec<Int, 16> &a)
{
  return _mm_srai_epi32(a, vec::min(COUNT, 31ul));
}
template <size_t COUNT>
static SIMD_INLINE Vec<Long, 16> srai(const Vec<Long, 16> &a)
{
  const __m128i odd = _mm_srai_epi32(a, vec::min(COUNT, 31ul));
  __m128i even;
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    even = _mm_or_si128(_mm_srli_epi32(a, COUNT),
                        _mm_slli_epi32(_mm_srli_si128(a, 4), 32 - COUNT));
  } else {
    even = _mm_srai_epi32(_mm_srli_si128(a, 4), vec::min(COUNT - 32, 31ul));
  }
#ifdef __SSE4_1__
  return _mm_blend_epi16(even, odd, 0xcc);
#else
  return _mm_or_si128(_mm_and_si128(even, _mm_set1_epi64x(0x00000000FFFFFFFF)),
                      _mm_and_si128(odd, _mm_set1_epi64x(0xFFFFFFFF00000000)));
#endif
}
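// 64-bit arithmetic shifts are assembled from 32-bit ones: the odd (upper)
// words are shifted arithmetically, the even (lower) words receive the
// bits shifted out of the upper words (or the saturated sign for
// COUNT >= 32), and the two halves are recombined per 64-bit lane.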
template <size_t COUNT>
static SIMD_INLINE Vec<Byte, 16> srli(const Vec<Byte, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    // shift 32-bit lanes and mask out the bits that crossed byte borders
    return _mm_and_si128(_mm_set1_epi8((int8_t) (0xff >> COUNT)),
                         _mm_srli_epi32(a, COUNT));
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<SignedByte, 16> srli(const Vec<SignedByte, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    return _mm_and_si128(_mm_set1_epi8((int8_t) (0xff >> COUNT)),
                         _mm_srli_epi32(a, COUNT));
  } else {
    return _mm_setzero_si128();
  }
}
template <size_t COUNT>
static SIMD_INLINE Vec<Word, 16> srli(const Vec<Word, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm_srli_epi16(a, COUNT);
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Short, 16> srli(const Vec<Short, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 16) {
    return _mm_srli_epi16(a, COUNT);
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Int, 16> srli(const Vec<Int, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 32) {
    return _mm_srli_epi32(a, COUNT);
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Long, 16> srli(const Vec<Long, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 64) {
    return _mm_srli_epi64(a, COUNT);
  } else {
    return _mm_setzero_si128();
  }
}
template <size_t COUNT>
static SIMD_INLINE Vec<Byte, 16> slli(const Vec<Byte, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    return _mm_and_si128(
      _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
      _mm_slli_epi32(a, COUNT));
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<SignedByte, 16> slli(const Vec<SignedByte, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < 8) {
    return _mm_and_si128(
      _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
      _mm_slli_epi32(a, COUNT));
  } else {
    return _mm_setzero_si128();
  }
}
2684template <
size_t COUNT>
2685static SIMD_INLINE Vec<Word, 16> slli(
const Vec<Word, 16> &a)
2687 SIMD_IF_CONSTEXPR (COUNT < 16) {
2688 return _mm_slli_epi16(a, COUNT);
2690 return _mm_setzero_si128();
2694template <
size_t COUNT>
2695static SIMD_INLINE Vec<Short, 16> slli(
const Vec<Short, 16> &a)
2697 SIMD_IF_CONSTEXPR (COUNT < 16) {
2698 return _mm_slli_epi16(a, COUNT);
2700 return _mm_setzero_si128();
2704template <
size_t COUNT>
2705static SIMD_INLINE Vec<Int, 16> slli(
const Vec<Int, 16> &a)
2707 SIMD_IF_CONSTEXPR (COUNT < 32) {
2708 return _mm_slli_epi32(a, COUNT);
2710 return _mm_setzero_si128();
2714template <
size_t COUNT>
2715static SIMD_INLINE Vec<Long, 16> slli(
const Vec<Long, 16> &a)
2717 SIMD_IF_CONSTEXPR (COUNT < 64) {
2718 return _mm_slli_epi64(a, COUNT);
2720 return _mm_setzero_si128();
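// Illustration (added sketch; srli_byte_scalar_sketch is hypothetical):
// SSE has no 8-bit shifts, so the Byte/SignedByte versions above shift
// whole 32-bit lanes and then mask off the bits that crossed byte
// boundaries. Per byte lane, srli effectively computes:
static SIMD_INLINE uint8_t srli_byte_scalar_sketch(uint8_t x, unsigned count)
{
  // in the packed register, the top `count` bits of each byte receive bits
  // of the neighboring byte after the wide shift; the mask clears them
  return (count < 8) ? (uint8_t) ((x >> count) & (0xff >> count)) : 0;
}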
static SIMD_INLINE Vec<Byte, 16> sra(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  if (count >= 8) {
    // every bit becomes a copy of the lane's sign bit
    return _mm_cmplt_epi8(a, _mm_setzero_si128());
  }
  __m128i odd = _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
  __m128i even =
    _mm_sra_epi16(_mm_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
  return ifelse<Byte>(_mm_set1_epi16((int16_t) 0xFF00), odd, even);
}

static SIMD_INLINE Vec<SignedByte, 16> sra(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  if (count >= 8) {
    return _mm_cmplt_epi8(a, _mm_setzero_si128());
  }
  __m128i odd = _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
  __m128i even =
    _mm_sra_epi16(_mm_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
  return ifelse<SignedByte>(_mm_set1_epi16((int16_t) 0xFF00), odd, even);
}
static SIMD_INLINE Vec<Word, 16> sra(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  return _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Short, 16> sra(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  return _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Int, 16> sra(const Vec<Int, 16> &a,
                                    const uint8_t count)
{
  return _mm_sra_epi32(a, _mm_cvtsi32_si128(count));
}
static SIMD_INLINE Vec<Long, 16> sra(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  // runtime-count version of the 64-bit arithmetic shift emulation
  const __m128i odd = _mm_sra_epi32(a, _mm_cvtsi32_si128(count));
  __m128i even;
  if (count < 32) {
    even = _mm_or_si128(
      _mm_srl_epi32(a, _mm_cvtsi32_si128(count)),
      _mm_sll_epi32(_mm_srli_si128(a, 4), _mm_cvtsi32_si128(32 - count)));
  } else {
    even = _mm_sra_epi32(_mm_srli_si128(a, 4), _mm_cvtsi32_si128(count - 32));
  }
#ifdef __SSE4_1__
  return _mm_blend_epi16(even, odd, 0xcc);
#else
  return _mm_or_si128(_mm_and_si128(even, _mm_set1_epi64x(0x00000000FFFFFFFF)),
                      _mm_and_si128(odd, _mm_set1_epi64x(0xFFFFFFFF00000000)));
#endif
}
static SIMD_INLINE Vec<Byte, 16> srl(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  return _mm_and_si128(_mm_srl_epi16(a, _mm_cvtsi32_si128(count)),
                       _mm_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
}

static SIMD_INLINE Vec<SignedByte, 16> srl(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  return _mm_and_si128(_mm_srl_epi16(a, _mm_cvtsi32_si128(count)),
                       _mm_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
}

static SIMD_INLINE Vec<Word, 16> srl(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  return _mm_srl_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Short, 16> srl(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  return _mm_srl_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Int, 16> srl(const Vec<Int, 16> &a,
                                    const uint8_t count)
{
  return _mm_srl_epi32(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Long, 16> srl(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  return _mm_srl_epi64(a, _mm_cvtsi32_si128(count));
}
static SIMD_INLINE Vec<Byte, 16> sll(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  return _mm_and_si128(
    _mm_sll_epi16(a, _mm_cvtsi32_si128(count)),
    _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
}

static SIMD_INLINE Vec<SignedByte, 16> sll(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  return _mm_and_si128(
    _mm_sll_epi16(a, _mm_cvtsi32_si128(count)),
    _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
}

static SIMD_INLINE Vec<Word, 16> sll(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  return _mm_sll_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Short, 16> sll(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  return _mm_sll_epi16(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Int, 16> sll(const Vec<Int, 16> &a,
                                    const uint8_t count)
{
  return _mm_sll_epi32(a, _mm_cvtsi32_si128(count));
}

static SIMD_INLINE Vec<Long, 16> sll(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  return _mm_sll_epi64(a, _mm_cvtsi32_si128(count));
}
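// Usage sketch (added; shift_dynamic_sketch is hypothetical): the template
// versions (srai/srli/slli) require a compile-time count; sra/srl/sll accept
// a runtime count, which they hand to the hardware via _mm_cvtsi32_si128:
static SIMD_INLINE Vec<Int, 16> shift_dynamic_sketch(const Vec<Int, 16> &v,
                                                     uint8_t bits)
{
  return srl(v, bits); // logical shift; sra(v, bits) would preserve the sign
}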
template <typename T>
static SIMD_INLINE Vec<T, 16> hadd(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  // generic fallback: separate even/odd elements, then add
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return add(x, y);
}
static SIMD_INLINE Vec<Word, 16> hadd(const Vec<Word, 16> &a,
                                      const Vec<Word, 16> &b)
{
  return _mm_hadd_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> hadd(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b)
{
  return _mm_hadd_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> hadd(const Vec<Int, 16> &a,
                                     const Vec<Int, 16> &b)
{
  return _mm_hadd_epi32(a, b);
}

static SIMD_INLINE Vec<Float, 16> hadd(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b)
{
  return _mm_hadd_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> hadd(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return _mm_hadd_pd(a, b);
}
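// Usage sketch (added; sum_lanes_sketch is a hypothetical helper): hadd adds
// horizontally adjacent pairs, so log2(elements) applications reduce a
// vector to its total, here for the four Int lanes:
static SIMD_INLINE Int sum_lanes_sketch(const Vec<Int, 16> &v)
{
  const Vec<Int, 16> s1 = hadd(v, v);   // (v0+v1, v2+v3, v0+v1, v2+v3)
  const Vec<Int, 16> s2 = hadd(s1, s1); // total in every lane
  return _mm_cvtsi128_si32(s2);         // extract lane 0
}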
template <typename T>
static SIMD_INLINE Vec<T, 16> hadds(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  // generic fallback: separate even/odd elements, then add with saturation
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return adds(x, y);
}
static SIMD_INLINE Vec<Short, 16> hadds(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_hadds_epi16(a, b);
}

// Float/Double have no wrap-around, so the plain horizontal add is used
static SIMD_INLINE Vec<Float, 16> hadds(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_hadd_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> hadds(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_hadd_pd(a, b);
}
template <typename T>
static SIMD_INLINE Vec<T, 16> hsub(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  // generic fallback: separate even/odd elements, then subtract
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return sub(x, y);
}
static SIMD_INLINE Vec<Word, 16> hsub(const Vec<Word, 16> &a,
                                      const Vec<Word, 16> &b)
{
  return _mm_hsub_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> hsub(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b)
{
  return _mm_hsub_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> hsub(const Vec<Int, 16> &a,
                                     const Vec<Int, 16> &b)
{
  return _mm_hsub_epi32(a, b);
}

static SIMD_INLINE Vec<Float, 16> hsub(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b)
{
  return _mm_hsub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> hsub(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return _mm_hsub_pd(a, b);
}
template <typename T>
static SIMD_INLINE Vec<T, 16> hsubs(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  // generic fallback: separate even/odd elements, subtract with saturation
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return subs(x, y);
}
static SIMD_INLINE Vec<Short, 16> hsubs(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_hsubs_epi16(a, b);
}

static SIMD_INLINE Vec<Float, 16> hsubs(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_hsub_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> hsubs(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_hsub_pd(a, b);
}
template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> srle(const Vec<T, 16> &a)
{
  const auto intA = reinterpret(a, OutputType<Int>());
  const Vec<Int, 16> result =
    _mm_srli_si128(intA, vec::min(COUNT * sizeof(T), 16lu));
  return reinterpret(result, OutputType<T>());
}

template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> slle(const Vec<T, 16> &a)
{
  const auto intA = reinterpret(a, OutputType<Int>());
  const Vec<Int, 16> result =
    _mm_slli_si128(intA, vec::min(COUNT * sizeof(T), 16lu));
  return reinterpret(result, OutputType<T>());
}
template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> alignre(const Vec<T, 16> &h, const Vec<T, 16> &l)
{
  SIMD_IF_CONSTEXPR (COUNT * sizeof(T) < 32) {
    return _mm_alignr_epi8(h, l, COUNT * sizeof(T));
  } else {
    return _mm_setzero_si128();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Float, 16> alignre(const Vec<Float, 16> &h,
                                          const Vec<Float, 16> &l)
{
  SIMD_IF_CONSTEXPR (COUNT * sizeof(Float) < 32) {
    return _mm_castsi128_ps(_mm_alignr_epi8(
      _mm_castps_si128(h), _mm_castps_si128(l), COUNT * sizeof(Float)));
  } else {
    return _mm_setzero_ps();
  }
}

template <size_t COUNT>
static SIMD_INLINE Vec<Double, 16> alignre(const Vec<Double, 16> &h,
                                           const Vec<Double, 16> &l)
{
  SIMD_IF_CONSTEXPR (COUNT * sizeof(Double) < 32) {
    return _mm_castsi128_pd(_mm_alignr_epi8(
      _mm_castpd_si128(h), _mm_castpd_si128(l), COUNT * sizeof(Double)));
  } else {
    return _mm_setzero_pd();
  }
}
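// Usage sketch (added; window1_sketch is hypothetical): alignre extracts a
// 16-byte window at an element offset from the concatenation h:l, e.g. the
// lanes (l1, l2, l3, h0) -- a register-only way to access shifted
// neighborhoods of two consecutive loads:
static SIMD_INLINE Vec<Float, 16> window1_sketch(const Vec<Float, 16> &h,
                                                 const Vec<Float, 16> &l)
{
  return alignre<1>(h, l);
}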
// pshufb masks for deinterleaving N interleaved vectors, element size 1
static SIMD_INLINE __m128i get_swizzle_mask(Integer<2>, Integer<1>)
{
  return _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<3>, Integer<1>)
{
  return _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<4>, Integer<1>)
{
  return _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<5>, Integer<1>)
{
  return _mm_setr_epi8(0, 5, 1, 6, 2, 7, 3, 8, 4, 9, -1, -1, -1, -1, -1, -1);
}

// pshufb masks for deinterleaving N interleaved vectors, element size 2
static SIMD_INLINE __m128i get_swizzle_mask(Integer<2>, Integer<2>)
{
  return _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<3>, Integer<2>)
{
  return _mm_setr_epi8(0, 1, 6, 7, 2, 3, 8, 9, 4, 5, 10, 11, -1, -1, -1, -1);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<4>, Integer<2>)
{
  return _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
}

static SIMD_INLINE __m128i get_swizzle_mask(Integer<5>, Integer<2>)
{
  return _mm_setr_epi8(0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, -1, -1, -1,
                       -1);
}

template <size_t N, typename T>
static SIMD_INLINE __m128i get_swizzle_mask()
{
  return get_swizzle_mask(Integer<N>(), Integer<sizeof(T)>());
}
template <size_t ALIGNOFF>
static SIMD_INLINE __m128i align_shuffle_128(__m128i lo, __m128i hi,
                                             __m128i mask)
{
  static_assert(ALIGNOFF < 32, "alignment offset must be below 32");
  return _mm_shuffle_epi8(_mm_alignr_epi8(hi, lo, ALIGNOFF), mask);
}
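// Usage sketch (added; gather_even_odd_sketch is hypothetical): the swizzle
// kernels below combine both building blocks: _mm_alignr_epi8 selects a
// 16-byte window that crosses a register boundary, and _mm_shuffle_epi8
// then gathers the interleaved elements. For example, splitting the window
// starting at byte 12 of lo into its even- and odd-indexed bytes:
static SIMD_INLINE __m128i gather_even_odd_sketch(__m128i lo, __m128i hi)
{
  return align_shuffle_128<12>(lo, hi, get_swizzle_mask<2, Byte>());
}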
template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16>[1], Integer<1>)
{
  // single vector: nothing to deinterleave
}
template <typename T,
          SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
{
  __m128i s[2];
  s[0] = _mm_shuffle_epi8(v[0], get_swizzle_mask<2, T>());
  s[1] = _mm_shuffle_epi8(v[1], get_swizzle_mask<2, T>());
  v[0] = _mm_unpacklo_epi64(s[0], s[1]);
  v[1] = _mm_unpackhi_epi64(s[0], s[1]);
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
{
  const __m128 v0tmp = reinterpret(v[0], OutputType<Float>());
  const __m128 v1tmp = reinterpret(v[1], OutputType<Float>());
  const Vec<Float, 16> v0TmpOut =
    _mm_shuffle_ps(v0tmp, v1tmp, _MM_SHUFFLE(2, 0, 2, 0));
  const Vec<Float, 16> v1TmpOut =
    _mm_shuffle_ps(v0tmp, v1tmp, _MM_SHUFFLE(3, 1, 3, 1));
  v[0] = reinterpret(v0TmpOut, OutputType<T>());
  v[1] = reinterpret(v1TmpOut, OutputType<T>());
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
          typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
{
  const __m128d v0tmp = reinterpret(v[0], OutputType<Double>());
  const __m128d v1tmp = reinterpret(v[1], OutputType<Double>());
  const Vec<Double, 16> v0TmpOut =
    _mm_shuffle_pd(v0tmp, v1tmp, _MM_SHUFFLE2(0, 0));
  const Vec<Double, 16> v1TmpOut =
    _mm_shuffle_pd(v0tmp, v1tmp, _MM_SHUFFLE2(1, 1));
  v[0] = reinterpret(v0TmpOut, OutputType<T>());
  v[1] = reinterpret(v1TmpOut, OutputType<T>());
}
template <typename T,
          SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
{
  __m128i mask = get_swizzle_mask<3, T>();
  __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
  __m128i s1 = align_shuffle_128<12>(v[0], v[1], mask);
  __m128i s2 = align_shuffle_128<8>(v[1], v[2], mask);
  __m128i s3 = align_shuffle_128<4>(v[2], _mm_undefined_si128(), mask);
  __m128i l01 = _mm_unpacklo_epi32(s0, s1);
  __m128i h01 = _mm_unpackhi_epi32(s0, s1);
  __m128i l23 = _mm_unpacklo_epi32(s2, s3);
  __m128i h23 = _mm_unpackhi_epi32(s2, s3);
  v[0] = _mm_unpacklo_epi64(l01, l23);
  v[1] = _mm_unpackhi_epi64(l01, l23);
  v[2] = _mm_unpacklo_epi64(h01, h23);
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
{
  const __m128 x0y0z0x1 = reinterpret(v[0], OutputType<Float>());
  const __m128 y1z1x2y2 = reinterpret(v[1], OutputType<Float>());
  const __m128 z2x3y3z3 = reinterpret(v[2], OutputType<Float>());
  const __m128 x2y2x3y3 =
    _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
  const __m128 y0z0y1z1 =
    _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
  const Vec<Float, 16> x0x1x2x3 =
    _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
  const Vec<Float, 16> y0y1y2y3 =
    _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
  const Vec<Float, 16> z0z1z2z3 =
    _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));
  v[0] = reinterpret(x0x1x2x3, OutputType<T>());
  v[1] = reinterpret(y0y1y2y3, OutputType<T>());
  v[2] = reinterpret(z0z1z2z3, OutputType<T>());
}
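// Usage sketch (added; deinterleave_rgb_sketch is hypothetical): the
// three-vector float swizzle above is the classic AoS-to-SoA transform,
// e.g. for four RGB pixels packed as (r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3):
static SIMD_INLINE void deinterleave_rgb_sketch(Vec<Float, 16> v[3])
{
  swizzle(v, Integer<3>()); // v[0]=r0..r3, v[1]=g0..g3, v[2]=b0..b3
}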
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
          typename = void, typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
{
  const __m128d x0y0 = reinterpret(v[0], OutputType<Double>());
  const __m128d z0x1 = reinterpret(v[1], OutputType<Double>());
  const __m128d y1z1 = reinterpret(v[2], OutputType<Double>());
  const Vec<Double, 16> x0x1 = _mm_shuffle_pd(x0y0, z0x1, _MM_SHUFFLE2(1, 0));
  const Vec<Double, 16> y0y1 = _mm_shuffle_pd(x0y0, y1z1, _MM_SHUFFLE2(0, 1));
  const Vec<Double, 16> z0z1 = _mm_shuffle_pd(z0x1, y1z1, _MM_SHUFFLE2(1, 0));
  v[0] = reinterpret(x0x1, OutputType<T>());
  v[1] = reinterpret(y0y1, OutputType<T>());
  v[2] = reinterpret(z0z1, OutputType<T>());
}
template <typename T,
          SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
{
  __m128i mask = get_swizzle_mask<4, T>();
  __m128i s[4];
  s[0] = _mm_shuffle_epi8(v[0], mask);
  s[1] = _mm_shuffle_epi8(v[1], mask);
  s[2] = _mm_shuffle_epi8(v[2], mask);
  s[3] = _mm_shuffle_epi8(v[3], mask);
  __m128i l01 = _mm_unpacklo_epi32(s[0], s[1]);
  __m128i h01 = _mm_unpackhi_epi32(s[0], s[1]);
  __m128i l23 = _mm_unpacklo_epi32(s[2], s[3]);
  __m128i h23 = _mm_unpackhi_epi32(s[2], s[3]);
  v[0] = _mm_unpacklo_epi64(l01, l23);
  v[1] = _mm_unpackhi_epi64(l01, l23);
  v[2] = _mm_unpacklo_epi64(h01, h23);
  v[3] = _mm_unpackhi_epi64(h01, h23);
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
{
  Vec<Float, 16> vFloat[4];
  for (size_t i = 0; i < 4; ++i) {
    vFloat[i] = reinterpret(v[i], OutputType<Float>());
  }
  __m128 s[4];
  s[0] = _mm_shuffle_ps(vFloat[0], vFloat[1], _MM_SHUFFLE(1, 0, 1, 0));
  s[1] = _mm_shuffle_ps(vFloat[0], vFloat[1], _MM_SHUFFLE(3, 2, 3, 2));
  s[2] = _mm_shuffle_ps(vFloat[2], vFloat[3], _MM_SHUFFLE(1, 0, 1, 0));
  s[3] = _mm_shuffle_ps(vFloat[2], vFloat[3], _MM_SHUFFLE(3, 2, 3, 2));
  Vec<Float, 16> vOut[4];
  vOut[0] = _mm_shuffle_ps(s[0], s[2], _MM_SHUFFLE(2, 0, 2, 0));
  vOut[1] = _mm_shuffle_ps(s[0], s[2], _MM_SHUFFLE(3, 1, 3, 1));
  vOut[2] = _mm_shuffle_ps(s[1], s[3], _MM_SHUFFLE(2, 0, 2, 0));
  vOut[3] = _mm_shuffle_ps(s[1], s[3], _MM_SHUFFLE(3, 1, 3, 1));
  for (size_t i = 0; i < 4; ++i) {
    v[i] = reinterpret(vOut[i], OutputType<T>());
  }
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
          typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
{
  const __m128d x0y0 = reinterpret(v[0], OutputType<Double>());
  const __m128d z0w0 = reinterpret(v[1], OutputType<Double>());
  const __m128d x1y1 = reinterpret(v[2], OutputType<Double>());
  const __m128d z1w1 = reinterpret(v[3], OutputType<Double>());
  const Vec<Double, 16> x0x1 = _mm_unpacklo_pd(x0y0, x1y1);
  const Vec<Double, 16> y0y1 = _mm_unpackhi_pd(x0y0, x1y1);
  const Vec<Double, 16> z0z1 = _mm_unpacklo_pd(z0w0, z1w1);
  const Vec<Double, 16> w0w1 = _mm_unpackhi_pd(z0w0, z1w1);
  v[0] = reinterpret(x0x1, OutputType<T>());
  v[1] = reinterpret(y0y1, OutputType<T>());
  v[2] = reinterpret(z0z1, OutputType<T>());
  v[3] = reinterpret(w0w1, OutputType<T>());
}
template <typename T,
          SIMD_ENABLE_IF(sizeof(T) == 1 && std::is_integral<T>::value)>
static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
{
  __m128i mask = get_swizzle_mask<5, T>();
  __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
  __m128i s1 = align_shuffle_128<10>(v[0], v[1], mask);
  __m128i s2 = align_shuffle_128<4>(v[1], v[2], mask);
  __m128i s3 = align_shuffle_128<14>(v[1], v[2], mask);
  __m128i s4 = align_shuffle_128<8>(v[2], v[3], mask);
  __m128i s5 = align_shuffle_128<2>(v[3], _mm_undefined_si128(), mask);
  __m128i s6 = align_shuffle_128<12>(v[3], v[4], mask);
  __m128i s7 = align_shuffle_128<6>(v[4], _mm_undefined_si128(), mask);
  __m128i l01 = _mm_unpacklo_epi16(s0, s1);
  __m128i h01 = _mm_unpackhi_epi16(s0, s1);
  __m128i l23 = _mm_unpacklo_epi16(s2, s3);
  __m128i h23 = _mm_unpackhi_epi16(s2, s3);
  __m128i l45 = _mm_unpacklo_epi16(s4, s5);
  __m128i h45 = _mm_unpackhi_epi16(s4, s5);
  __m128i l67 = _mm_unpacklo_epi16(s6, s7);
  __m128i h67 = _mm_unpackhi_epi16(s6, s7);
  __m128i ll01l23 = _mm_unpacklo_epi32(l01, l23);
  __m128i hl01l23 = _mm_unpackhi_epi32(l01, l23);
  __m128i ll45l67 = _mm_unpacklo_epi32(l45, l67);
  __m128i hl45l67 = _mm_unpackhi_epi32(l45, l67);
  __m128i lh01h23 = _mm_unpacklo_epi32(h01, h23);
  __m128i lh45h67 = _mm_unpacklo_epi32(h45, h67);
  v[0] = _mm_unpacklo_epi64(ll01l23, ll45l67);
  v[1] = _mm_unpackhi_epi64(ll01l23, ll45l67);
  v[2] = _mm_unpacklo_epi64(hl01l23, hl45l67);
  v[3] = _mm_unpackhi_epi64(hl01l23, hl45l67);
  v[4] = _mm_unpacklo_epi64(lh01h23, lh45h67);
}
template <typename T,
          SIMD_ENABLE_IF(sizeof(T) == 2 && std::is_integral<T>::value),
          typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
{
  __m128i mask = get_swizzle_mask<5, T>();
  __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
  __m128i s1 = align_shuffle_128<6>(v[0], v[1], mask);
  __m128i s2 = align_shuffle_128<4>(v[1], v[2], mask);
  __m128i s3 = align_shuffle_128<10>(v[1], v[2], mask);
  __m128i s4 = align_shuffle_128<8>(v[2], v[3], mask);
  __m128i s5 = align_shuffle_128<14>(v[2], v[3], mask);
  __m128i s6 = align_shuffle_128<12>(v[3], v[4], mask);
  __m128i s7 = align_shuffle_128<2>(v[4], _mm_undefined_si128(), mask);
  __m128i l02 = _mm_unpacklo_epi32(s0, s2);
  __m128i h02 = _mm_unpackhi_epi32(s0, s2);
  __m128i l13 = _mm_unpacklo_epi32(s1, s3);
  __m128i l46 = _mm_unpacklo_epi32(s4, s6);
  __m128i h46 = _mm_unpackhi_epi32(s4, s6);
  __m128i l57 = _mm_unpacklo_epi32(s5, s7);
  v[0] = _mm_unpacklo_epi64(l02, l46);
  v[1] = _mm_unpackhi_epi64(l02, l46);
  v[2] = _mm_unpacklo_epi64(h02, h46);
  v[3] = _mm_unpacklo_epi64(l13, l57);
  v[4] = _mm_unpackhi_epi64(l13, l57);
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void,
          typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> vT[5], Integer<5>)
{
  Vec<Int, 16> v[5];
  for (size_t i = 0; i < 5; i++) {
    v[i] = reinterpret(vT[i], OutputType<Int>());
  }
  // windows crossing the register boundaries of the interleaved input
  __m128i s2 = _mm_alignr_epi8(v[2], v[1], 4);
  __m128i s3 = _mm_alignr_epi8(v[3], v[2], 4);
  __m128i s4 = _mm_alignr_epi8(v[3], v[2], 8);
  __m128i s5 = _mm_alignr_epi8(v[4], v[3], 8);
  __m128i s6 = _mm_alignr_epi8(v[4], v[3], 12);
  __m128i s7 = _mm_alignr_epi8(v[0], v[4], 12);
  __m128i l02 = _mm_unpacklo_epi32(v[0], s2);
  __m128i h02 = _mm_unpackhi_epi32(v[0], s2);
  __m128i l13 = _mm_unpacklo_epi32(v[1], s3);
  __m128i l46 = _mm_unpacklo_epi32(s4, s6);
  __m128i h46 = _mm_unpackhi_epi32(s4, s6);
  __m128i l57 = _mm_unpacklo_epi32(s5, s7);
  const Vec<Int, 16> vOut[5] = {
    _mm_unpacklo_epi64(l02, l46),
    _mm_unpackhi_epi64(l02, l46),
    _mm_unpacklo_epi64(h02, h46),
    _mm_unpackhi_epi64(h02, h46),
    _mm_unpacklo_epi64(l13, l57),
  };
  for (size_t i = 0; i < 5; ++i) {
    vT[i] = reinterpret(vOut[i], OutputType<T>());
  }
}
template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
          typename = void, typename = void>
static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
{
  const __m128d a0b0 = reinterpret(v[0], OutputType<Double>());
  const __m128d c0d0 = reinterpret(v[1], OutputType<Double>());
  const __m128d e0a1 = reinterpret(v[2], OutputType<Double>());
  const __m128d b1c1 = reinterpret(v[3], OutputType<Double>());
  const __m128d d1e1 = reinterpret(v[4], OutputType<Double>());
  const Vec<Double, 16> a0a1 = _mm_shuffle_pd(a0b0, e0a1, _MM_SHUFFLE2(1, 0));
  const Vec<Double, 16> b0b1 = _mm_shuffle_pd(a0b0, b1c1, _MM_SHUFFLE2(0, 1));
  const Vec<Double, 16> c0c1 = _mm_shuffle_pd(c0d0, b1c1, _MM_SHUFFLE2(1, 0));
  const Vec<Double, 16> d0d1 = _mm_shuffle_pd(c0d0, d1e1, _MM_SHUFFLE2(0, 1));
  const Vec<Double, 16> e0e1 = _mm_shuffle_pd(e0a1, d1e1, _MM_SHUFFLE2(1, 0));
  v[0] = reinterpret(a0a1, OutputType<T>());
  v[1] = reinterpret(b0b1, OutputType<T>());
  v[2] = reinterpret(c0c1, OutputType<T>());
  v[3] = reinterpret(d0d1, OutputType<T>());
  v[4] = reinterpret(e0e1, OutputType<T>());
}
static SIMD_INLINE Vec<Byte, 16> cmplt(const Vec<Byte, 16> &a,
                                       const Vec<Byte, 16> &b)
{
  // unsigned compare: flip the sign bits, then use the signed instruction
  __m128i signbit = _mm_set1_epi32(0x80808080);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_cmplt_epi8(a1, b1);
}

static SIMD_INLINE Vec<SignedByte, 16> cmplt(const Vec<SignedByte, 16> &a,
                                             const Vec<SignedByte, 16> &b)
{
  return _mm_cmplt_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> cmplt(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80008000);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_cmplt_epi16(a1, b1);
}

static SIMD_INLINE Vec<Short, 16> cmplt(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_cmplt_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> cmplt(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return _mm_cmplt_epi32(a, b);
}
static SIMD_INLINE Vec<Long, 16> cmplt(const Vec<Long, 16> &a,
                                       const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  return _mm_cmpgt_epi64(b, a);
#else
  // SSE2 fallback: overflow-corrected sign of (a - b)
  const __m128i diff = _mm_sub_epi64(a, b);
  const __m128i res = _mm_xor_si128(
    diff, _mm_and_si128(_mm_xor_si128(a, b), _mm_xor_si128(diff, a)));
  // equivalent alternative formulation:
  //   res = _mm_or_si128(_mm_andnot_si128(b, a),
  //                      _mm_andnot_si128(_mm_xor_si128(a, b), diff));
  // spread each 64-bit lane's sign bit to the whole lane
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
}
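// Illustration (added sketch, assuming two's complement int64_t;
// cmplt_long_scalar_sketch is hypothetical): the SSE2 fallback above cannot
// use the sign of (a - b) directly because the subtraction may overflow;
// the xor term repairs the sign in exactly that case. Scalar analogue:
static SIMD_INLINE bool cmplt_long_scalar_sketch(int64_t a, int64_t b)
{
  const uint64_t ua = (uint64_t) a, ub = (uint64_t) b;
  const uint64_t diff = ua - ub;
  const uint64_t res = diff ^ ((ua ^ ub) & (diff ^ ua));
  return (res >> 63) != 0; // corrected sign bit == (a < b)
}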
static SIMD_INLINE Vec<Float, 16> cmplt(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_cmplt_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmplt(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_cmplt_pd(a, b);
}
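// Illustration (added sketch; cmplt_byte_scalar_sketch is hypothetical): the
// unsigned comparisons above reuse the signed compare instructions by
// flipping the sign bit of both operands, which maps the unsigned range
// monotonically onto the signed one:
static SIMD_INLINE bool cmplt_byte_scalar_sketch(uint8_t a, uint8_t b)
{
  return (int8_t) (a ^ 0x80) < (int8_t) (b ^ 0x80); // same result as a < b
}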
static SIMD_INLINE Vec<Byte, 16> cmple(const Vec<Byte, 16> &a,
                                       const Vec<Byte, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80808080);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_or_si128(_mm_cmplt_epi8(a1, b1), _mm_cmpeq_epi8(a1, b1));
}

static SIMD_INLINE Vec<SignedByte, 16> cmple(const Vec<SignedByte, 16> &a,
                                             const Vec<SignedByte, 16> &b)
{
  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
}

static SIMD_INLINE Vec<Word, 16> cmple(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80008000);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_or_si128(_mm_cmplt_epi16(a1, b1), _mm_cmpeq_epi16(a1, b1));
}

static SIMD_INLINE Vec<Short, 16> cmple(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}

static SIMD_INLINE Vec<Int, 16> cmple(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
}
static SIMD_INLINE Vec<Long, 16> cmple(const Vec<Long, 16> &a,
                                       const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
#else
  // SSE2 fallback: the sign bit of res encodes a <= b, overflow-safe
  const __m128i res = _mm_and_si128(
    _mm_or_si128(a, _mm_xor_si128(b, _mm_set1_epi32(-1))),
    _mm_or_si128(_mm_xor_si128(a, b),
                 _mm_xor_si128(_mm_sub_epi64(b, a), _mm_set1_epi32(-1))));
  // spread each 64-bit lane's sign bit to the whole lane
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
}
static SIMD_INLINE Vec<Float, 16> cmple(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_cmple_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmple(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_cmple_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> cmpeq(const Vec<Byte, 16> &a,
                                       const Vec<Byte, 16> &b)
{
  return _mm_cmpeq_epi8(a, b);
}

static SIMD_INLINE Vec<SignedByte, 16> cmpeq(const Vec<SignedByte, 16> &a,
                                             const Vec<SignedByte, 16> &b)
{
  return _mm_cmpeq_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> cmpeq(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b)
{
  return _mm_cmpeq_epi16(a, b);
}

static SIMD_INLINE Vec<Short, 16> cmpeq(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_cmpeq_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> cmpeq(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return _mm_cmpeq_epi32(a, b);
}
static SIMD_INLINE Vec<Long, 16> cmpeq(const Vec<Long, 16> &a,
                                       const Vec<Long, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_cmpeq_epi64(a, b);
#else
  // a 64-bit lane is equal iff both of its 32-bit halves are equal
  const __m128i res32 = _mm_cmpeq_epi32(a, b);
  return _mm_and_si128(res32,
                       _mm_shuffle_epi32(res32, _MM_SHUFFLE(2, 3, 0, 1)));
#endif
}
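// Illustration (added sketch; cmpeq_long_scalar_sketch is hypothetical):
// without _mm_cmpeq_epi64, the fallback above ANDs the 32-bit compare
// result with a copy whose halves are swapped. Scalar analogue per lane:
static SIMD_INLINE bool cmpeq_long_scalar_sketch(int64_t a, int64_t b)
{
  const bool lo = (uint32_t) a == (uint32_t) b;
  const bool hi = (uint32_t) (a >> 32) == (uint32_t) (b >> 32);
  return lo && hi; // per-lane analogue of res32 & swapped(res32)
}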
static SIMD_INLINE Vec<Float, 16> cmpeq(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_cmpeq_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmpeq(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_cmpeq_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> cmpgt(const Vec<Byte, 16> &a,
                                       const Vec<Byte, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80808080);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_cmpgt_epi8(a1, b1);
}

static SIMD_INLINE Vec<SignedByte, 16> cmpgt(const Vec<SignedByte, 16> &a,
                                             const Vec<SignedByte, 16> &b)
{
  return _mm_cmpgt_epi8(a, b);
}

static SIMD_INLINE Vec<Word, 16> cmpgt(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80008000);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_cmpgt_epi16(a1, b1);
}

static SIMD_INLINE Vec<Short, 16> cmpgt(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_cmpgt_epi16(a, b);
}

static SIMD_INLINE Vec<Int, 16> cmpgt(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return _mm_cmpgt_epi32(a, b);
}
static SIMD_INLINE Vec<Long, 16> cmpgt(const Vec<Long, 16> &a,
                                       const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  return _mm_cmpgt_epi64(a, b);
#else
  // SSE2 fallback: a > b <=> b < a, overflow-corrected sign of (b - a)
  const __m128i diff = _mm_sub_epi64(b, a);
  const __m128i res = _mm_xor_si128(
    diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
  // equivalent alternative formulation:
  //   res = _mm_or_si128(_mm_andnot_si128(a, b),
  //                      _mm_andnot_si128(_mm_xor_si128(b, a), diff));
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
}
static SIMD_INLINE Vec<Float, 16> cmpgt(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_cmpgt_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmpgt(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_cmpgt_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> cmpge(const Vec<Byte, 16> &a,
                                       const Vec<Byte, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80808080);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_or_si128(_mm_cmpgt_epi8(a1, b1), _mm_cmpeq_epi8(a1, b1));
}

static SIMD_INLINE Vec<SignedByte, 16> cmpge(const Vec<SignedByte, 16> &a,
                                             const Vec<SignedByte, 16> &b)
{
  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
}

static SIMD_INLINE Vec<Word, 16> cmpge(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80008000);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  return _mm_or_si128(_mm_cmpgt_epi16(a1, b1), _mm_cmpeq_epi16(a1, b1));
}

static SIMD_INLINE Vec<Short, 16> cmpge(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}

static SIMD_INLINE Vec<Int, 16> cmpge(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
}
static SIMD_INLINE Vec<Long, 16> cmpge(const Vec<Long, 16> &a,
                                       const Vec<Long, 16> &b)
{
#ifdef __SSE4_2__
  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
#else
  // SSE2 fallback: the sign bit of res encodes b <= a, overflow-safe
  const __m128i res = _mm_and_si128(
    _mm_or_si128(b, _mm_xor_si128(a, _mm_set1_epi32(-1))),
    _mm_or_si128(_mm_xor_si128(b, a),
                 _mm_xor_si128(_mm_sub_epi64(a, b), _mm_set1_epi32(-1))));
  const __m128i spread32 = _mm_srai_epi32(res, 31);
  return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
#endif
}
static SIMD_INLINE Vec<Float, 16> cmpge(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return _mm_cmpge_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmpge(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return _mm_cmpge_pd(a, b);
}
static SIMD_INLINE Vec<Byte, 16> cmpneq(const Vec<Byte, 16> &a,
                                        const Vec<Byte, 16> &b)
{
  return _mm_xor_si128(_mm_cmpeq_epi8(a, b), _mm_set1_epi32(-1));
}

static SIMD_INLINE Vec<SignedByte, 16> cmpneq(const Vec<SignedByte, 16> &a,
                                              const Vec<SignedByte, 16> &b)
{
  return _mm_xor_si128(_mm_cmpeq_epi8(a, b), _mm_set1_epi32(-1));
}

static SIMD_INLINE Vec<Word, 16> cmpneq(const Vec<Word, 16> &a,
                                        const Vec<Word, 16> &b)
{
  return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_set1_epi32(-1));
}

static SIMD_INLINE Vec<Short, 16> cmpneq(const Vec<Short, 16> &a,
                                         const Vec<Short, 16> &b)
{
  return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_set1_epi32(-1));
}

static SIMD_INLINE Vec<Int, 16> cmpneq(const Vec<Int, 16> &a,
                                       const Vec<Int, 16> &b)
{
  return _mm_xor_si128(_mm_cmpeq_epi32(a, b), _mm_set1_epi32(-1));
}
static SIMD_INLINE Vec<Long, 16> cmpneq(const Vec<Long, 16> &a,
                                        const Vec<Long, 16> &b)
{
#ifdef __SSE4_1__
  return _mm_xor_si128(_mm_cmpeq_epi64(a, b), _mm_set1_epi32(-1));
#else
  const __m128i eq32 = _mm_cmpeq_epi32(a, b);
  const __m128i shuffledRes = _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i eq64 = _mm_and_si128(eq32, shuffledRes);
  return _mm_xor_si128(eq64, _mm_set1_epi32(-1));
#endif
}
static SIMD_INLINE Vec<Float, 16> cmpneq(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b)
{
  return _mm_cmpneq_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> cmpneq(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b)
{
  return _mm_cmpneq_pd(a, b);
}
template <typename T>
static SIMD_INLINE Vec<T, 16> bit_and(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  return _mm_and_si128(a, b);
}

static SIMD_INLINE Vec<Float, 16> bit_and(const Vec<Float, 16> &a,
                                          const Vec<Float, 16> &b)
{
  return _mm_and_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> bit_and(const Vec<Double, 16> &a,
                                           const Vec<Double, 16> &b)
{
  return _mm_and_pd(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> bit_or(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  return _mm_or_si128(a, b);
}

static SIMD_INLINE Vec<Float, 16> bit_or(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b)
{
  return _mm_or_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> bit_or(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b)
{
  return _mm_or_pd(a, b);
}

// bit_andnot(a, b) computes (~a) & b, matching the andnot instructions
template <typename T>
static SIMD_INLINE Vec<T, 16> bit_andnot(const Vec<T, 16> &a,
                                         const Vec<T, 16> &b)
{
  return _mm_andnot_si128(a, b);
}

static SIMD_INLINE Vec<Float, 16> bit_andnot(const Vec<Float, 16> &a,
                                             const Vec<Float, 16> &b)
{
  return _mm_andnot_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> bit_andnot(const Vec<Double, 16> &a,
                                              const Vec<Double, 16> &b)
{
  return _mm_andnot_pd(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> bit_xor(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  return _mm_xor_si128(a, b);
}

static SIMD_INLINE Vec<Float, 16> bit_xor(const Vec<Float, 16> &a,
                                          const Vec<Float, 16> &b)
{
  return _mm_xor_ps(a, b);
}

static SIMD_INLINE Vec<Double, 16> bit_xor(const Vec<Double, 16> &a,
                                           const Vec<Double, 16> &b)
{
  return _mm_xor_pd(a, b);
}

template <typename T>
static SIMD_INLINE Vec<T, 16> bit_not(const Vec<T, 16> &a)
{
  return _mm_xor_si128(a, _mm_set1_epi32(-1));
}

static SIMD_INLINE Vec<Float, 16> bit_not(const Vec<Float, 16> &a)
{
  return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1)));
}

static SIMD_INLINE Vec<Double, 16> bit_not(const Vec<Double, 16> &a)
{
  return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1)));
}
static SIMD_INLINE Vec<Byte, 16> avg(const Vec<Byte, 16> &a,
                                     const Vec<Byte, 16> &b)
{
  return _mm_avg_epu8(a, b);
}

// signed average via the unsigned instruction, using the sign-bit flip
static SIMD_INLINE Vec<SignedByte, 16> avg(const Vec<SignedByte, 16> &a,
                                           const Vec<SignedByte, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80808080);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  __m128i m1 = _mm_avg_epu8(a1, b1);
  return _mm_xor_si128(m1, signbit);
}

static SIMD_INLINE Vec<Word, 16> avg(const Vec<Word, 16> &a,
                                     const Vec<Word, 16> &b)
{
  return _mm_avg_epu16(a, b);
}

static SIMD_INLINE Vec<Short, 16> avg(const Vec<Short, 16> &a,
                                      const Vec<Short, 16> &b)
{
  __m128i signbit = _mm_set1_epi32(0x80008000);
  __m128i a1 = _mm_xor_si128(a, signbit);
  __m128i b1 = _mm_xor_si128(b, signbit);
  __m128i m1 = _mm_avg_epu16(a1, b1);
  return _mm_xor_si128(m1, signbit);
}

// overflow-free rounding-up average: (a | b) - ((a ^ b) >> 1)
static SIMD_INLINE Vec<Int, 16> avg(const Vec<Int, 16> &a,
                                    const Vec<Int, 16> &b)
{
  return _mm_sub_epi32(_mm_or_si128(a, b),
                       _mm_srai_epi32(_mm_xor_si128(a, b), 1));
}

static SIMD_INLINE Vec<Long, 16> avg(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  return _mm_sub_epi64(_mm_or_si128(a, b),
                       srai<1>(Vec<Long, 16>(_mm_xor_si128(a, b))));
}

static SIMD_INLINE Vec<Float, 16> avg(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  __m128 half = _mm_set1_ps(0.5f);
  return _mm_mul_ps(_mm_add_ps(a, b), half);
}

static SIMD_INLINE Vec<Double, 16> avg(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  __m128d half = _mm_set1_pd(0.5);
  return _mm_mul_pd(_mm_add_pd(a, b), half);
}
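// Illustration (added sketch, assuming arithmetic >> on signed scalars;
// avg_scalar_sketch is hypothetical): the Int/Long averages above rely on
// a + b = (a | b) + (a & b), hence
// ceil((a + b) / 2) = (a | b) - ((a ^ b) >> 1), which cannot overflow:
static SIMD_INLINE int32_t avg_scalar_sketch(int32_t a, int32_t b)
{
  return (int32_t) ((a | b) - ((a ^ b) >> 1));
}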
template <typename T>
static SIMD_INLINE bool test_all_zeros(const Vec<T, 16> &a)
{
  const auto intA = reinterpret(a, OutputType<Int>());
#ifdef __SSE4_1__
  return _mm_test_all_zeros(intA, intA);
#else
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_setzero_si128(), intA)) ==
          0xffff);
#endif
}
template <typename T>
static SIMD_INLINE bool test_all_ones(const Vec<T, 16> &a)
{
  const auto intA = reinterpret(a, OutputType<Int>());
#ifdef __SSE4_1__
  return _mm_test_all_ones(intA);
#else
  // comparing a register with itself yields all-ones, whatever its content
  __m128i undef = _mm_undefined_si128();
  __m128i ones = _mm_cmpeq_epi8(undef, undef);
  return _mm_movemask_epi8(_mm_cmpeq_epi8(ones, intA)) == 0xffff;
#endif
}
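// Usage sketch (added; vectors_equal_sketch is hypothetical): comparisons
// produce all-ones/all-zeros lane masks, so test_all_zeros turns a per-lane
// result into a single branchable bool, e.g. whole-vector equality:
static SIMD_INLINE bool vectors_equal_sketch(const Vec<Int, 16> &a,
                                             const Vec<Int, 16> &b)
{
  return test_all_zeros(cmpneq(a, b));
}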
static SIMD_INLINE Vec<Byte, 16> reverse(const Vec<Byte, 16> &a)
{
  const __m128i mask =
    _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
}

static SIMD_INLINE Vec<SignedByte, 16> reverse(const Vec<SignedByte, 16> &a)
{
  const __m128i mask =
    _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
}

static SIMD_INLINE Vec<Short, 16> reverse(const Vec<Short, 16> &a)
{
  const __m128i mask =
    _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  return _mm_shuffle_epi8(a, mask);
}

static SIMD_INLINE Vec<Word, 16> reverse(const Vec<Word, 16> &a)
{
  const __m128i mask =
    _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  return _mm_shuffle_epi8(a, mask);
}

static SIMD_INLINE Vec<Int, 16> reverse(const Vec<Int, 16> &a)
{
  return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
}

static SIMD_INLINE Vec<Long, 16> reverse(const Vec<Long, 16> &a)
{
  return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
}

static SIMD_INLINE Vec<Float, 16> reverse(const Vec<Float, 16> &a)
{
  return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
}

static SIMD_INLINE Vec<Double, 16> reverse(const Vec<Double, 16> &a)
{
  return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1));
}
static SIMD_INLINE uint64_t msb2int(const Vec<Byte, 16> &a)
{
  return _mm_movemask_epi8(a);
}

static SIMD_INLINE uint64_t msb2int(const Vec<SignedByte, 16> &a)
{
  return _mm_movemask_epi8(a);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Short, 16> &a)
{
  // gather the high byte of each 16-bit lane into the low 8 bytes,
  // then use the byte-wise movemask
  const __m128i mask =
    _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 15, 13, 11, 9, 7, 5, 3, 1);
  const __m128i shuffled = _mm_shuffle_epi8(a, mask);
  return _mm_movemask_epi8(shuffled);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Word, 16> &a)
{
  const __m128i mask =
    _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 15, 13, 11, 9, 7, 5, 3, 1);
  const __m128i shuffled = _mm_shuffle_epi8(a, mask);
  return _mm_movemask_epi8(shuffled);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Int, 16> &a)
{
  return _mm_movemask_ps(_mm_castsi128_ps(a));
}

static SIMD_INLINE uint64_t msb2int(const Vec<Long, 16> &a)
{
  return _mm_movemask_pd(_mm_castsi128_pd(a));
}

static SIMD_INLINE uint64_t msb2int(const Vec<Float, 16> &a)
{
  return _mm_movemask_ps(a);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Double, 16> &a)
{
  return _mm_movemask_pd(a);
}
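// Usage sketch (added; count_less_sketch is hypothetical): msb2int packs
// the lane-mask sign bits into an integer, so counting lanes that satisfy
// a predicate is a popcount over the comparison result:
static SIMD_INLINE int count_less_sketch(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b)
{
  uint64_t bits = msb2int(cmplt(a, b));
  int n = 0;
  while (bits != 0) { bits &= bits - 1; ++n; } // Kernighan bit count
  return n;
}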
static SIMD_INLINE Vec<Byte, 16> int2msb(const uint64_t a, OutputType<Byte>,
                                         Integer<16>)
{
#ifdef __SSSE3__
  // broadcast bits 0..7 of a to the low 8 lanes, bits 8..15 to the high ones
  __m128i shuffleIndices = _mm_set_epi64x(0x0101010101010101, 0);
  __m128i aVec = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndices);
#else
  __m128i maskLo = _mm_set_epi64x(0, 0xffffffffffffffff);
  __m128i aLo = _mm_and_si128(maskLo, _mm_set1_epi8(a));
  __m128i aHi = _mm_andnot_si128(maskLo, _mm_set1_epi8(a >> 8));
  __m128i aVec = _mm_or_si128(aLo, aHi);
#endif
  __m128i sel = _mm_set1_epi64x(0x8040201008040201);
  __m128i selected = _mm_and_si128(aVec, sel);
  __m128i result = _mm_cmpeq_epi8(selected, sel);
  return _mm_and_si128(result, _mm_set1_epi8((int8_t) 0x80));
}

static SIMD_INLINE Vec<SignedByte, 16> int2msb(const uint64_t a,
                                               OutputType<SignedByte>,
                                               Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Byte>(), Integer<16>()),
                     OutputType<SignedByte>());
}
static SIMD_INLINE Vec<Short, 16> int2msb(const uint64_t a, OutputType<Short>,
                                          Integer<16>)
{
  __m128i aVec = _mm_set1_epi16(a);
  __m128i sel = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004,
                              0x0002, 0x0001);
  __m128i selected = _mm_and_si128(aVec, sel);
  __m128i result = _mm_cmpeq_epi16(selected, sel);
  return _mm_and_si128(result, _mm_set1_epi16((int16_t) 0x8000));
}

static SIMD_INLINE Vec<Word, 16> int2msb(const uint64_t a, OutputType<Word>,
                                         Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Short>(), Integer<16>()),
                     OutputType<Word>());
}

static SIMD_INLINE Vec<Int, 16> int2msb(const uint64_t a, OutputType<Int>,
                                        Integer<16>)
{
  __m128i aVec = _mm_set1_epi32(a);
  __m128i sel = _mm_set_epi32(0x00000008, 0x00000004, 0x00000002, 0x00000001);
  __m128i selected = _mm_and_si128(aVec, sel);
  __m128i result = _mm_cmpeq_epi32(selected, sel);
  return _mm_and_si128(result, _mm_set1_epi32(0x80000000));
}

static SIMD_INLINE Vec<Long, 16> int2msb(const uint64_t a, OutputType<Long>,
                                         Integer<16>)
{
  return _mm_set_epi64x((a & 2) ? 0x8000000000000000 : 0,
                        (a & 1) ? 0x8000000000000000 : 0);
}

static SIMD_INLINE Vec<Float, 16> int2msb(const uint64_t a, OutputType<Float>,
                                          Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Int>(), Integer<16>()),
                     OutputType<Float>());
}

static SIMD_INLINE Vec<Double, 16> int2msb(const uint64_t a, OutputType<Double>,
                                           Integer<16>)
{
  return _mm_set_pd((a & 2) ? -0.0 : 0.0, (a & 1) ? -0.0 : 0.0);
}
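// Illustration (added sketch; int2msb_lane_scalar_sketch is hypothetical):
// int2msb/int2bits broadcast the bit mask to all lanes, AND each lane with
// its own select bit and compare against that bit; int2msb additionally
// keeps only the lane's MSB. Scalar analogue for Int lane i:
static SIMD_INLINE int32_t int2msb_lane_scalar_sketch(uint64_t a, int i)
{
  const uint32_t sel = 1u << i; // per-lane select bit
  return (((uint32_t) a & sel) == sel) ? (int32_t) 0x80000000 : 0;
}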
static SIMD_INLINE Vec<Byte, 16> int2bits(const uint64_t a, OutputType<Byte>,
                                          Integer<16>)
{
#ifdef __SSSE3__
  // broadcast bits 0..7 of a to the low 8 lanes, bits 8..15 to the high ones
  __m128i shuffleIndices = _mm_set_epi64x(0x0101010101010101, 0);
  __m128i aVec = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndices);
#else
  __m128i maskLo = _mm_set_epi64x(0, 0xffffffffffffffff);
  __m128i aLo = _mm_and_si128(maskLo, _mm_set1_epi8(a));
  __m128i aHi = _mm_andnot_si128(maskLo, _mm_set1_epi8(a >> 8));
  __m128i aVec = _mm_or_si128(aLo, aHi);
#endif
  __m128i sel = _mm_set1_epi64x(0x8040201008040201);
  __m128i selected = _mm_and_si128(aVec, sel);
  return _mm_cmpeq_epi8(selected, sel);
}

static SIMD_INLINE Vec<SignedByte, 16> int2bits(const uint64_t a,
                                                OutputType<SignedByte>,
                                                Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Byte>(), Integer<16>()),
                     OutputType<SignedByte>());
}
static SIMD_INLINE Vec<Short, 16> int2bits(const uint64_t a, OutputType<Short>,
                                           Integer<16>)
{
  __m128i aVec = _mm_set1_epi16(a);
  __m128i sel = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004,
                              0x0002, 0x0001);
  __m128i selected = _mm_and_si128(aVec, sel);
  return _mm_cmpeq_epi16(selected, sel);
}

static SIMD_INLINE Vec<Word, 16> int2bits(const uint64_t a, OutputType<Word>,
                                          Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Short>(), Integer<16>()),
                     OutputType<Word>());
}

static SIMD_INLINE Vec<Int, 16> int2bits(const uint64_t a, OutputType<Int>,
                                         Integer<16>)
{
  __m128i aVec = _mm_set1_epi32(a);
  __m128i sel = _mm_set_epi32(0x00000008, 0x00000004, 0x00000002, 0x00000001);
  __m128i selected = _mm_and_si128(aVec, sel);
  return _mm_cmpeq_epi32(selected, sel);
}

static SIMD_INLINE Vec<Long, 16> int2bits(const uint64_t a, OutputType<Long>,
                                          Integer<16>)
{
  return _mm_set_epi64x((a & 2) ? -1 : 0, (a & 1) ? -1 : 0);
}

static SIMD_INLINE Vec<Float, 16> int2bits(const uint64_t a, OutputType<Float>,
                                           Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Int>(), Integer<16>()),
                     OutputType<Float>());
}

static SIMD_INLINE Vec<Double, 16> int2bits(const uint64_t a,
                                            OutputType<Double>, Integer<16>)
{
  const auto trueVal = TypeInfo<Double>::trueval();
  return _mm_set_pd((a & 2) ? trueVal : 0.0, (a & 1) ? trueVal : 0.0);
}
static SIMD_INLINE Vec<Byte, 16> iota(OutputType<Byte>, Integer<16>)
{
  return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

static SIMD_INLINE Vec<SignedByte, 16> iota(OutputType<SignedByte>, Integer<16>)
{
  return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

static SIMD_INLINE Vec<Short, 16> iota(OutputType<Short>, Integer<16>)
{
  return _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
}

static SIMD_INLINE Vec<Word, 16> iota(OutputType<Word>, Integer<16>)
{
  return _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
}

static SIMD_INLINE Vec<Int, 16> iota(OutputType<Int>, Integer<16>)
{
  return _mm_set_epi32(3, 2, 1, 0);
}

static SIMD_INLINE Vec<Long, 16> iota(OutputType<Long>, Integer<16>)
{
  return _mm_set_epi64x(1, 0);
}

static SIMD_INLINE Vec<Float, 16> iota(OutputType<Float>, Integer<16>)
{
  return _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
}

static SIMD_INLINE Vec<Double, 16> iota(OutputType<Double>, Integer<16>)
{
  return _mm_set_pd(1.0, 0.0);
}