#ifndef SIMD_VEC_BASE_IMPL_NEON_16_H_
#define SIMD_VEC_BASE_IMPL_NEON_16_H_

#include "intrins_neon.H"

#if defined(SIMDVEC_NEON_ENABLE) && defined(_SIMD_VEC_16_AVAIL_) && \
    !defined(SIMDVEC_SANDBOX)
// primary template; only the specializations below are ever used
template <typename T>
struct _NEONRegType;

template <> struct _NEONRegType<Byte> { using Type = uint8x16_t; };
template <> struct _NEONRegType<SignedByte> { using Type = int8x16_t; };
template <> struct _NEONRegType<Word> { using Type = uint16x8_t; };
template <> struct _NEONRegType<Short> { using Type = int16x8_t; };
template <> struct _NEONRegType<Int> { using Type = int32x4_t; };
template <> struct _NEONRegType<Float> { using Type = float32x4_t; };
#ifdef SIMD_64BIT_TYPES
template <> struct _NEONRegType<Long> { using Type = int64x2_t; };
template <> struct _NEONRegType<Double> { using Type = float64x2_t; };
#endif

// maps an element type T to the corresponding 128-bit NEON register type
template <typename T>
using NEONRegType = typename _NEONRegType<T>::Type;
template <size_t N, typename T>
struct SIMDVecNeonArray64;

#define SIMDVEC_NEON_ARRAY64(NUM, T, NEON_T)                                   \
  template <>                                                                  \
  struct SIMDVecNeonArray64<NUM, T>                                            \
  {                                                                            \
    using Type    = NEON_T##x##NUM##_t;                                        \
    using ValType = NEON_T##_t;                                                \
  };

#define SIMDVEC_NEON_ARRAY64_ALLNUM(T, NEON_T)                                 \
  SIMDVEC_NEON_ARRAY64(1, T, NEON_T)                                           \
  SIMDVEC_NEON_ARRAY64(2, T, NEON_T)                                           \
  SIMDVEC_NEON_ARRAY64(3, T, NEON_T)                                           \
  SIMDVEC_NEON_ARRAY64(4, T, NEON_T)
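// For illustration: SIMDVEC_NEON_ARRAY64(2, Byte, uint8x8) expands to a
// specialization with Type = uint8x8x2_t and ValType = uint8x8_t, i.e. a
// group of two 64-bit NEON registers as consumed by the vtbl2/vld2
// intrinsics.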
SIMDVEC_NEON_ARRAY64_ALLNUM(Byte, uint8x8)
SIMDVEC_NEON_ARRAY64_ALLNUM(SignedByte, int8x8)
SIMDVEC_NEON_ARRAY64_ALLNUM(Word, uint16x4)
SIMDVEC_NEON_ARRAY64_ALLNUM(Short, int16x4)
SIMDVEC_NEON_ARRAY64_ALLNUM(Int, int32x2)
SIMDVEC_NEON_ARRAY64_ALLNUM(Float, float32x2)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_ARRAY64_ALLNUM(Double, float64x1)
#endif

#undef SIMDVEC_NEON_ARRAY64
#undef SIMDVEC_NEON_ARRAY64_ALLNUM

// 16-byte vector of elements of type T, backed by a single NEON register
template <typename T>
class Vec<T, 16>
{
public:
  using RegType = internal::base::NEONRegType<T>;
  static constexpr size_t elements = 16 / sizeof(T);
  static constexpr size_t bytes    = 16;
  Vec() = default;
  Vec(const RegType &x) { reg = x; }
  Vec &operator=(const RegType &x)
  {
    reg = x;
    return *this;
  }
  operator RegType() const { return reg; }

private:
  RegType reg;
};
#define SIMDVEC_NEON_BINARY(FCT, TYPE, NEON_FCT, NEON_SUF)                     \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       const Vec<TYPE, 16> &b)                 \
  {                                                                            \
    return NEON_FCT##_##NEON_SUF(a, b);                                        \
  }

#ifdef SIMD_64BIT_TYPES
#define SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT)                              \
  SIMDVEC_NEON_BINARY(FCT, Byte, NEON_FCT, u8)                                 \
  SIMDVEC_NEON_BINARY(FCT, SignedByte, NEON_FCT, s8)                           \
  SIMDVEC_NEON_BINARY(FCT, Word, NEON_FCT, u16)                                \
  SIMDVEC_NEON_BINARY(FCT, Short, NEON_FCT, s16)                               \
  SIMDVEC_NEON_BINARY(FCT, Int, NEON_FCT, s32)                                 \
  SIMDVEC_NEON_BINARY(FCT, Long, NEON_FCT, s64)
#else
#define SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT)                              \
  SIMDVEC_NEON_BINARY(FCT, Byte, NEON_FCT, u8)                                 \
  SIMDVEC_NEON_BINARY(FCT, SignedByte, NEON_FCT, s8)                           \
  SIMDVEC_NEON_BINARY(FCT, Word, NEON_FCT, u16)                                \
  SIMDVEC_NEON_BINARY(FCT, Short, NEON_FCT, s16)                               \
  SIMDVEC_NEON_BINARY(FCT, Int, NEON_FCT, s32)
#endif

#ifdef SIMD_64BIT_TYPES
#define SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT)                            \
  SIMDVEC_NEON_BINARY(FCT, Float, NEON_FCT, f32)                               \
  SIMDVEC_NEON_BINARY(FCT, Double, NEON_FCT, f64)
#else
#define SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT)                            \
  SIMDVEC_NEON_BINARY(FCT, Float, NEON_FCT, f32)
#endif

#define SIMDVEC_NEON_BINARY_ALL(FCT, NEON_FCT)                                 \
  SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT)                                    \
  SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT)
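// For illustration: SIMDVEC_NEON_BINARY_ALL(add, vaddq) emits one add()
// overload per element type, each forwarding to the matching intrinsic
// (vaddq_u8, vaddq_s8, ..., vaddq_f32, plus the 64-bit variants when
// SIMD_64BIT_TYPES is defined).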
#define SIMDVEC_NEON_UNARY(FCT, TYPE, NEON_FCT, NEON_SUF)                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a)                 \
  {                                                                            \
    return NEON_FCT##_##NEON_SUF(a);                                           \
  }

#define SIMDVEC_NEON_REINTERP(TDST, NEON_TDST, TSRC, NEON_TSRC)                \
  static SIMD_INLINE Vec<TDST, 16> reinterpret(const Vec<TSRC, 16> &vec,       \
                                               OutputType<TDST>)               \
  {                                                                            \
    return vreinterpretq_##NEON_TDST##_##NEON_TSRC(vec);                       \
  }

#ifdef SIMD_64BIT_TYPES
#define SIMDVEC_NEON_REINTERP_ALLDST(TSRC, NEON_TSRC)                          \
  SIMDVEC_NEON_REINTERP(Byte, u8, TSRC, NEON_TSRC)                             \
  SIMDVEC_NEON_REINTERP(SignedByte, s8, TSRC, NEON_TSRC)                       \
  SIMDVEC_NEON_REINTERP(Word, u16, TSRC, NEON_TSRC)                            \
  SIMDVEC_NEON_REINTERP(Short, s16, TSRC, NEON_TSRC)                           \
  SIMDVEC_NEON_REINTERP(Int, s32, TSRC, NEON_TSRC)                             \
  SIMDVEC_NEON_REINTERP(Long, s64, TSRC, NEON_TSRC)                            \
  SIMDVEC_NEON_REINTERP(Float, f32, TSRC, NEON_TSRC)                           \
  SIMDVEC_NEON_REINTERP(Double, f64, TSRC, NEON_TSRC)
#else
#define SIMDVEC_NEON_REINTERP_ALLDST(TSRC, NEON_TSRC)                          \
  SIMDVEC_NEON_REINTERP(Byte, u8, TSRC, NEON_TSRC)                             \
  SIMDVEC_NEON_REINTERP(SignedByte, s8, TSRC, NEON_TSRC)                       \
  SIMDVEC_NEON_REINTERP(Word, u16, TSRC, NEON_TSRC)                            \
  SIMDVEC_NEON_REINTERP(Short, s16, TSRC, NEON_TSRC)                           \
  SIMDVEC_NEON_REINTERP(Int, s32, TSRC, NEON_TSRC)                             \
  SIMDVEC_NEON_REINTERP(Float, f32, TSRC, NEON_TSRC)
#endif

SIMDVEC_NEON_REINTERP_ALLDST(Byte, u8)
SIMDVEC_NEON_REINTERP_ALLDST(SignedByte, s8)
SIMDVEC_NEON_REINTERP_ALLDST(Word, u16)
SIMDVEC_NEON_REINTERP_ALLDST(Short, s16)
SIMDVEC_NEON_REINTERP_ALLDST(Int, s32)
SIMDVEC_NEON_REINTERP_ALLDST(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_REINTERP_ALLDST(Long, s64)
SIMDVEC_NEON_REINTERP_ALLDST(Double, f64)
#endif

#undef SIMDVEC_NEON_REINTERP_ALLDST
#undef SIMDVEC_NEON_REINTERP
static SIMD_INLINE Vec<Int, 16> cvts(const Vec<Float, 16> &a, OutputType<Int>)
{
  return vcvtq_s32_f32(a);
}

static SIMD_INLINE Vec<Float, 16> cvts(const Vec<Int, 16> &a, OutputType<Float>)
{
  return vcvtq_f32_s32(a);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> cvts(const Vec<Double, 16> &a,
                                      OutputType<Long>)
{
  return vcvtq_s64_f64(a);
}

static SIMD_INLINE Vec<Double, 16> cvts(const Vec<Long, 16> &a,
                                        OutputType<Double>)
{
  return vcvtq_f64_s64(a);
}
#endif

#define SIMDVEC_NEON_SETZERO(TYPE, NEON_SUF)                                   \
  static SIMD_INLINE Vec<TYPE, 16> setzero(OutputType<TYPE>, Integer<16>)      \
  {                                                                            \
    return vmovq_n##_##NEON_SUF(TYPE(0));                                      \
  }

SIMDVEC_NEON_SETZERO(Byte, u8)
SIMDVEC_NEON_SETZERO(SignedByte, s8)
SIMDVEC_NEON_SETZERO(Word, u16)
SIMDVEC_NEON_SETZERO(Short, s16)
SIMDVEC_NEON_SETZERO(Int, s32)
SIMDVEC_NEON_SETZERO(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_SETZERO(Long, s64)
SIMDVEC_NEON_SETZERO(Double, f64)
#endif

#undef SIMDVEC_NEON_SETZERO
#define SIMDVEC_NEON_SET1(TYPE, NEON_SUF)                                      \
  static SIMD_INLINE Vec<TYPE, 16> set1(TYPE a, Integer<16>)                   \
  {                                                                            \
    return vdupq_n##_##NEON_SUF(a);                                            \
  }

SIMDVEC_NEON_SET1(Byte, u8)
SIMDVEC_NEON_SET1(SignedByte, s8)
SIMDVEC_NEON_SET1(Word, u16)
SIMDVEC_NEON_SET1(Short, s16)
SIMDVEC_NEON_SET1(Int, s32)
SIMDVEC_NEON_SET1(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_SET1(Long, s64)
SIMDVEC_NEON_SET1(Double, f64)
#endif

#undef SIMDVEC_NEON_SET1

// NEON vld1q supports unaligned addresses, so load and loadu share the
// same implementation
#define SIMDVEC_NEON_LOAD(TYPE, NEON_SUF)                                      \
  static SIMD_INLINE Vec<TYPE, 16> load(const TYPE *const p, Integer<16>)      \
  {                                                                            \
    return vld1q##_##NEON_SUF(p);                                              \
  }                                                                            \
  static SIMD_INLINE Vec<TYPE, 16> loadu(const TYPE *const p, Integer<16>)     \
  {                                                                            \
    return vld1q##_##NEON_SUF(p);                                              \
  }

SIMDVEC_NEON_LOAD(Byte, u8)
SIMDVEC_NEON_LOAD(SignedByte, s8)
SIMDVEC_NEON_LOAD(Word, u16)
SIMDVEC_NEON_LOAD(Short, s16)
SIMDVEC_NEON_LOAD(Int, s32)
SIMDVEC_NEON_LOAD(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_LOAD(Long, s64)
SIMDVEC_NEON_LOAD(Double, f64)
#endif

#undef SIMDVEC_NEON_LOAD
// NEON has no non-temporal store, so stream_store falls back to a regular
// store
#define SIMDVEC_NEON_STORE(TYPE, NEON_SUF)                                     \
  static SIMD_INLINE void store(TYPE *const p, const Vec<TYPE, 16> &a)         \
  {                                                                            \
    return vst1q##_##NEON_SUF(p, a);                                           \
  }                                                                            \
  static SIMD_INLINE void storeu(TYPE *const p, const Vec<TYPE, 16> &a)        \
  {                                                                            \
    return vst1q##_##NEON_SUF(p, a);                                           \
  }                                                                            \
  static SIMD_INLINE void stream_store(TYPE *const p, const Vec<TYPE, 16> &a)  \
  {                                                                            \
    return vst1q##_##NEON_SUF(p, a);                                           \
  }

SIMDVEC_NEON_STORE(Byte, u8)
SIMDVEC_NEON_STORE(SignedByte, s8)
SIMDVEC_NEON_STORE(Word, u16)
SIMDVEC_NEON_STORE(Short, s16)
SIMDVEC_NEON_STORE(Int, s32)
SIMDVEC_NEON_STORE(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_STORE(Long, s64)
SIMDVEC_NEON_STORE(Double, f64)
#endif

#undef SIMDVEC_NEON_STORE

// NEON provides no load-only or store-only fences; all three map to a
// full memory barrier
static SIMD_INLINE void lfence()
{
  SIMD_FULL_MEMBARRIER;
}

static SIMD_INLINE void sfence()
{
  SIMD_FULL_MEMBARRIER;
}

static SIMD_INLINE void mfence()
{
  SIMD_FULL_MEMBARRIER;
}
#define SIMDVEC_NEON_EXTRACT(TYPE, NEON_SUF)                                   \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE TYPE extract(const Vec<TYPE, 16> &a)                      \
  {                                                                            \
    SIMD_IF_CONSTEXPR (COUNT < Vec<TYPE, 16>::elements) {                      \
      return vgetq_lane##_##NEON_SUF(a, COUNT);                                \
    } else {                                                                   \
      /* element index out of range: return zero */                            \
      return TYPE(0);                                                          \
    }                                                                          \
  }

SIMDVEC_NEON_EXTRACT(Byte, u8)
SIMDVEC_NEON_EXTRACT(SignedByte, s8)
SIMDVEC_NEON_EXTRACT(Word, u16)
SIMDVEC_NEON_EXTRACT(Short, s16)
SIMDVEC_NEON_EXTRACT(Int, s32)
SIMDVEC_NEON_EXTRACT(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_EXTRACT(Long, s64)
SIMDVEC_NEON_EXTRACT(Double, f64)
#endif

#undef SIMDVEC_NEON_EXTRACT
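// Usage sketch (illustrative, not part of the original header):
//   Vec<Int, 16> v = set1(Int(7), Integer<16>());
//   Int x = extract<2>(v); // reads element 2, here x == 7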
// addition
SIMDVEC_NEON_BINARY_ALL(add, vaddq)

// saturated addition (floating-point addition does not saturate)
SIMDVEC_NEON_BINARY_ALLINT(adds, vqaddq)
SIMDVEC_NEON_BINARY(adds, Float, vaddq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_BINARY(adds, Double, vaddq, f64)
#endif

// subtraction
SIMDVEC_NEON_BINARY_ALL(sub, vsubq)

// saturated subtraction
SIMDVEC_NEON_BINARY_ALLINT(subs, vqsubq)
SIMDVEC_NEON_BINARY(subs, Float, vsubq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_BINARY(subs, Double, vsubq, f64)
#endif

// negation (signed types only)
SIMDVEC_NEON_UNARY(neg, SignedByte, vnegq, s8)
SIMDVEC_NEON_UNARY(neg, Short, vnegq, s16)
SIMDVEC_NEON_UNARY(neg, Int, vnegq, s32)
SIMDVEC_NEON_UNARY(neg, Float, vnegq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_UNARY(neg, Long, vnegq, s64)
SIMDVEC_NEON_UNARY(neg, Double, vnegq, f64)
#endif
// minimum
SIMDVEC_NEON_BINARY(min, Byte, vminq, u8)
SIMDVEC_NEON_BINARY(min, SignedByte, vminq, s8)
SIMDVEC_NEON_BINARY(min, Word, vminq, u16)
SIMDVEC_NEON_BINARY(min, Short, vminq, s16)
SIMDVEC_NEON_BINARY(min, Int, vminq, s32)
SIMDVEC_NEON_BINARY(min, Float, vminq, f32)
#ifdef SIMD_64BIT_TYPES
// there is no vminq_s64; select via compare and bit-select
static SIMD_INLINE Vec<Long, 16> min(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  return vbslq_s64(vcltq_s64(a, b), a, b);
}
SIMDVEC_NEON_BINARY(min, Double, vminq, f64)
#endif

// maximum
SIMDVEC_NEON_BINARY(max, Byte, vmaxq, u8)
SIMDVEC_NEON_BINARY(max, SignedByte, vmaxq, s8)
SIMDVEC_NEON_BINARY(max, Word, vmaxq, u16)
SIMDVEC_NEON_BINARY(max, Short, vmaxq, s16)
SIMDVEC_NEON_BINARY(max, Int, vmaxq, s32)
SIMDVEC_NEON_BINARY(max, Float, vmaxq, f32)
#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> max(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  return vbslq_s64(vcgtq_s64(a, b), a, b);
}
SIMDVEC_NEON_BINARY(max, Double, vmaxq, f64)
#endif
// multiplication (floating-point types only)
SIMDVEC_NEON_BINARY(mul, Float, vmulq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_BINARY(mul, Double, vmulq, f64)
#endif

constexpr size_t DIV_NEWTON_STEPS = 2;
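// Newton-Raphson refinement of the reciprocal estimate: vrecpeq gives
// roughly 8 bits of precision, and vrecpsq_f32(d, r) returns 2 - d * r, so
// each step computes r' = r * (2 - d * r), roughly doubling the number of
// correct bits; two steps approach full single precision.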
static SIMD_INLINE Vec<Float, 16> div(const Vec<Float, 16> &num,
                                      const Vec<Float, 16> &denom)
{
  float32x4_t reciprocal = vrecpeq_f32(denom);
  for (size_t i = 0; i < DIV_NEWTON_STEPS; i++)
    reciprocal = vmulq_f32(vrecpsq_f32(denom, reciprocal), reciprocal);
  return vmulq_f32(num, reciprocal);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> div(const Vec<Double, 16> &num,
                                       const Vec<Double, 16> &denom)
{
  float64x2_t reciprocal = vrecpeq_f64(denom);
  for (size_t i = 0; i < DIV_NEWTON_STEPS; i++)
    reciprocal = vmulq_f64(vrecpsq_f64(denom, reciprocal), reciprocal);
  return vmulq_f64(num, reciprocal);
}
#endif
// rounding functions are the identity for integral types
template <typename T>
static SIMD_INLINE Vec<T, 16> ceil(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> floor(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> round(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}

template <typename T>
static SIMD_INLINE Vec<T, 16> truncate(const Vec<T, 16> &a)
{
  static_assert(std::is_integral<T>::value, "");
  return a;
}
// the directed-rounding intrinsics (vrndpq etc.) require ARMv8; the exact
// guard used here is a reconstruction
#ifdef __aarch64__
SIMDVEC_NEON_UNARY(ceil, Float, vrndpq, f32)
SIMDVEC_NEON_UNARY(floor, Float, vrndmq, f32)
SIMDVEC_NEON_UNARY(round, Float, vrndnq, f32)
SIMDVEC_NEON_UNARY(truncate, Float, vrndq, f32)

#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_UNARY(ceil, Double, vrndpq, f64)
SIMDVEC_NEON_UNARY(floor, Double, vrndmq, f64)
SIMDVEC_NEON_UNARY(round, Double, vrndnq, f64)
SIMDVEC_NEON_UNARY(truncate, Double, vrndq, f64)
#endif

#else
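// Emulation for targets without directed-rounding intrinsics. The trick
// relies on the IEEE-754 binary32 layout: for |a| >= 2^23 (= 8388608)
// every representable float is already an integer, so those lanes are
// passed through unchanged; smaller values are rounded via a round trip
// through int32, which cannot overflow.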
static SIMD_INLINE Vec<Float, 16> truncate(const Vec<Float, 16> &a)
{
  float32x4_t limit = vmovq_n_f32(8388608.f);
  // lanes with |a| >= 2^23 need no rounding
  uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
  // int32 round trip truncates toward zero
  float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
  return vbslq_f32(noRndReq, a, aTrunc);
}

static SIMD_INLINE Vec<Float, 16> floor(const Vec<Float, 16> &a)
{
  float32x4_t limit = vmovq_n_f32(8388608.f);
  uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
  // sign bit smeared across all 32 bits: all-ones for negative lanes
  uint32x4_t isNeg =
    vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
  float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
  uint32x4_t isNotInt = vmvnq_u32(vceqq_f32(a, aTrunc));
  float32x4_t one = vmovq_n_f32(1.0f);
  // 1.0f in lanes that are negative and non-integral, 0.0f elsewhere
  float32x4_t oneMask = vreinterpretq_f32_u32(
    vandq_u32(vandq_u32(isNeg, isNotInt), vreinterpretq_u32_f32(one)));
  aTrunc = vsubq_f32(aTrunc, oneMask);
  return vbslq_f32(noRndReq, a, aTrunc);
}
static SIMD_INLINE Vec<Float, 16> ceil(const Vec<Float, 16> &a)
{
  float32x4_t limit = vmovq_n_f32(8388608.f);
  uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
  uint32x4_t isNotNeg = vmvnq_u32(
    vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31)));
  float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
  uint32x4_t isNotInt = vmvnq_u32(vceqq_f32(a, aTrunc));
  float32x4_t one = vmovq_n_f32(1.0f);
  // 1.0f in lanes that are non-negative and non-integral, 0.0f elsewhere
  float32x4_t oneMask = vreinterpretq_f32_u32(
    vandq_u32(vandq_u32(isNotNeg, isNotInt), vreinterpretq_u32_f32(one)));
  aTrunc = vaddq_f32(aTrunc, oneMask);
  return vbslq_f32(noRndReq, a, aTrunc);
}

// rounds with ties toward +infinity (floor(a + 0.5))
static SIMD_INLINE Vec<Float, 16> round(const Vec<Float, 16> &a)
{
  return floor(add(a, set1(Float(0.5f), Integer<16>())));
}

#endif
// reciprocal estimate
SIMDVEC_NEON_UNARY(rcp, Float, vrecpeq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_UNARY(rcp, Double, vrecpeq, f64)
#endif

// reciprocal square root estimate
SIMDVEC_NEON_UNARY(rsqrt, Float, vrsqrteq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_UNARY(rsqrt, Double, vrsqrteq, f64)
#endif

constexpr size_t SQRT_NEWTON_STEPS = 2;
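// sqrt(a) is computed as a * rsqrt(a), with the rsqrt estimate refined by
// Newton-Raphson: vrsqrtsq(a, r * r) returns (3 - a * r * r) / 2, so each
// step computes r' = r * (3 - a * r^2) / 2. Zero inputs are patched to
// 1.0 before the iteration (rsqrt(0) would be infinite) and restored
// afterwards.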
static SIMD_INLINE Vec<Float, 16> sqrt(const Vec<Float, 16> &a)
{
  float32x4_t zero = vmovq_n_f32(0.0f), one = vmovq_n_f32(1.0f);
  uint32x4_t isZero = vceqq_f32(a, zero);
  float32x4_t as = vbslq_f32(isZero, one, a);
  float32x4_t rSqrt = vrsqrteq_f32(as);
  for (size_t i = 0; i < SQRT_NEWTON_STEPS; i++)
    rSqrt = vmulq_f32(vrsqrtsq_f32(as, vmulq_f32(rSqrt, rSqrt)), rSqrt);
  float32x4_t res = vmulq_f32(as, rSqrt);
  return vbslq_f32(isZero, zero, res);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> sqrt(const Vec<Double, 16> &a)
{
  float64x2_t zero = vmovq_n_f64(0.0), one = vmovq_n_f64(1.0);
  uint64x2_t isZero = vceqq_f64(a, zero);
  float64x2_t as = vbslq_f64(isZero, one, a);
  float64x2_t rSqrt = vrsqrteq_f64(as);
  for (size_t i = 0; i < SQRT_NEWTON_STEPS; i++)
    rSqrt = vmulq_f64(vrsqrtsq_f64(as, vmulq_f64(rSqrt, rSqrt)), rSqrt);
  float64x2_t res = vmulq_f64(as, rSqrt);
  return vbslq_f64(isZero, zero, res);
}
#endif
// abs is the identity for unsigned integral types
template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value &&
                                     std::is_integral<T>::value)>
static SIMD_INLINE Vec<T, 16> abs(const Vec<T, 16> &a)
{
  return a;
}

SIMDVEC_NEON_UNARY(abs, SignedByte, vabsq, s8)
SIMDVEC_NEON_UNARY(abs, Short, vabsq, s16)
SIMDVEC_NEON_UNARY(abs, Int, vabsq, s32)
SIMDVEC_NEON_UNARY(abs, Float, vabsq, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_UNARY(abs, Long, vabsq, s64)
SIMDVEC_NEON_UNARY(abs, Double, vabsq, f64)
#endif
// unpack interleaves blocks of BYTES bytes from the lower (PART = 0) or
// upper (PART = 1) halves of a and b
#define SIMDVEC_NEON_UNPACK(TYPE, BYTES, NEON_SUF, NEON_SUF2)                  \
  template <size_t PART>                                                       \
  static SIMD_INLINE Vec<TYPE, 16> unpack(                                     \
    const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<PART>, Bytes<BYTES>)  \
  {                                                                            \
    return vreinterpretq_##NEON_SUF##_##NEON_SUF2(                             \
      (vzipq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a),            \
                         vreinterpretq_##NEON_SUF2##_##NEON_SUF(b)))           \
        .val[PART]);                                                           \
  }

// 8-byte blocks are unpacked by recombining register halves
#define SIMDVEC_NEON_UNPACK_HALFS(TYPE, BYTES, NEON_SUF)                       \
  static SIMD_INLINE Vec<TYPE, 16> unpack(                                     \
    const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<0>, Bytes<BYTES>)     \
  {                                                                            \
    return vcombine_##NEON_SUF(vget_low##_##NEON_SUF(a),                       \
                               vget_low##_##NEON_SUF(b));                      \
  }                                                                            \
  static SIMD_INLINE Vec<TYPE, 16> unpack(                                     \
    const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<1>, Bytes<BYTES>)     \
  {                                                                            \
    return vcombine_##NEON_SUF(vget_high##_##NEON_SUF(a),                      \
                               vget_high##_##NEON_SUF(b));                     \
  }

SIMDVEC_NEON_UNPACK(Byte, 1, u8, u8)
SIMDVEC_NEON_UNPACK(Byte, 2, u8, u16)
SIMDVEC_NEON_UNPACK(Byte, 4, u8, u32)
SIMDVEC_NEON_UNPACK_HALFS(Byte, 8, u8)
SIMDVEC_NEON_UNPACK(SignedByte, 1, s8, s8)
SIMDVEC_NEON_UNPACK(SignedByte, 2, s8, s16)
SIMDVEC_NEON_UNPACK(SignedByte, 4, s8, s32)
SIMDVEC_NEON_UNPACK_HALFS(SignedByte, 8, s8)
SIMDVEC_NEON_UNPACK(Word, 2, u16, u16)
SIMDVEC_NEON_UNPACK(Word, 4, u16, u32)
SIMDVEC_NEON_UNPACK_HALFS(Word, 8, u16)
SIMDVEC_NEON_UNPACK(Short, 2, s16, s16)
SIMDVEC_NEON_UNPACK(Short, 4, s16, s32)
SIMDVEC_NEON_UNPACK_HALFS(Short, 8, s16)
SIMDVEC_NEON_UNPACK(Int, 4, s32, s32)
SIMDVEC_NEON_UNPACK_HALFS(Int, 8, s32)
SIMDVEC_NEON_UNPACK(Float, 4, f32, f32)
SIMDVEC_NEON_UNPACK_HALFS(Float, 8, f32)
#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> unpack(const Vec<Long, 16> &a,
                                        const Vec<Long, 16> &b, Part<0>,
                                        Bytes<8>)
{
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
}

static SIMD_INLINE Vec<Long, 16> unpack(const Vec<Long, 16> &a,
                                        const Vec<Long, 16> &b, Part<1>,
                                        Bytes<8>)
{
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
}

static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b, Part<0>,
                                          Bytes<8>)
{
  return vcombine_f64(vget_low_f64(a), vget_low_f64(b));
}

static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b, Part<1>,
                                          Bytes<8>)
{
  return vcombine_f64(vget_high_f64(a), vget_high_f64(b));
}
#endif

#undef SIMDVEC_NEON_UNPACK
#undef SIMDVEC_NEON_UNPACK_HALFS
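// For illustration: with Byte elements, unpack(a, b, Part<0>(), Bytes<1>())
// yields the interleaving (a0, b0, a1, b1, ..., a7, b7) of the lower
// halves; Part<1> interleaves the upper halves accordingly.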
// generic dispatcher for 16-byte unpack
template <size_t PART, size_t BYTES, typename T>
static SIMD_INLINE Vec<T, 16> unpack16(const Vec<T, 16> &a,
                                       const Vec<T, 16> &b, Part<PART>,
                                       Bytes<BYTES>)
{
  return unpack(a, b, Part<PART>(), Bytes<BYTES>());
}

// a 16-byte vector consists of a single 16-byte lane (body reconstructed)
template <size_t LANE_INDEX, typename T>
static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 16> &a)
{
  static_assert(LANE_INDEX == 0, "");
  return a;
}
#define SIMDVEC_NEON_ZIP(TYPE, NUM_ELEMS, NEON_SUF, NEON_SUF2, NEONX2_2)       \
  static SIMD_INLINE void zip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b,    \
                              Vec<TYPE, 16> &c, Vec<TYPE, 16> &d,              \
                              Elements<NUM_ELEMS>)                             \
  {                                                                            \
    NEONX2_2 res;                                                              \
    res = vzipq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a),         \
                            vreinterpretq_##NEON_SUF2##_##NEON_SUF(b));        \
    c = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[0]);                    \
    d = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[1]);                    \
  }

#define SIMDVEC_NEON_ZIP_HALFS(TYPE, NUM_ELEMS, NEON_SUF)                      \
  static SIMD_INLINE void zip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b,    \
                              Vec<TYPE, 16> &c, Vec<TYPE, 16> &d,              \
                              Elements<NUM_ELEMS>)                             \
  {                                                                            \
    c = vcombine_##NEON_SUF(vget_low_##NEON_SUF(a), vget_low_##NEON_SUF(b));   \
    d = vcombine_##NEON_SUF(vget_high_##NEON_SUF(a), vget_high_##NEON_SUF(b)); \
  }

SIMDVEC_NEON_ZIP(Byte, 1, u8, u8, uint8x16x2_t)
SIMDVEC_NEON_ZIP(Byte, 2, u8, u16, uint16x8x2_t)
SIMDVEC_NEON_ZIP(Byte, 4, u8, u32, uint32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(Byte, 8, u8)
SIMDVEC_NEON_ZIP(SignedByte, 1, s8, s8, int8x16x2_t)
SIMDVEC_NEON_ZIP(SignedByte, 2, s8, s16, int16x8x2_t)
SIMDVEC_NEON_ZIP(SignedByte, 4, s8, s32, int32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(SignedByte, 8, s8)
SIMDVEC_NEON_ZIP(Word, 1, u16, u16, uint16x8x2_t)
SIMDVEC_NEON_ZIP(Word, 2, u16, u32, uint32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(Word, 4, u16)
SIMDVEC_NEON_ZIP(Short, 1, s16, s16, int16x8x2_t)
SIMDVEC_NEON_ZIP(Short, 2, s16, s32, int32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(Short, 4, s16)
SIMDVEC_NEON_ZIP(Int, 1, s32, s32, int32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(Int, 2, s32)
SIMDVEC_NEON_ZIP(Float, 1, f32, f32, float32x4x2_t)
SIMDVEC_NEON_ZIP_HALFS(Float, 2, f32)
#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void zip(const Vec<Long, 16> a, const Vec<Long, 16> b,
                            Vec<Long, 16> &c, Vec<Long, 16> &d, Elements<1>)
{
  c = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
  d = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
}

static SIMD_INLINE void zip(const Vec<Double, 16> a, const Vec<Double, 16> b,
                            Vec<Double, 16> &c, Vec<Double, 16> &d,
                            Elements<1>)
{
  c = vcombine_f64(vget_low_f64(a), vget_low_f64(b));
  d = vcombine_f64(vget_high_f64(a), vget_high_f64(b));
}
#endif

template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip(const Vec<T, 16> a, const Vec<T, 16> b,
                            Vec<T, 16> &c, Vec<T, 16> &d)
{
  return zip(a, b, c, d, Elements<NUM_ELEMS>());
}

#undef SIMDVEC_NEON_ZIP
#undef SIMDVEC_NEON_ZIP_HALFS
template <size_t NUM_ELEMS, typename T>
static SIMD_INLINE void zip16(const Vec<T, 16> a, const Vec<T, 16> b,
                              Vec<T, 16> &l, Vec<T, 16> &h)
{
  zip<NUM_ELEMS, T>(a, b, l, h);
}

// unzip de-interleaves blocks of BYTES bytes from a and b
#define SIMDVEC_NEON_UNZIP(TYPE, BYTES, NEON_SUF, NEON_SUF2, NEONX2_2)         \
  static SIMD_INLINE void unzip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b,  \
                                Vec<TYPE, 16> &c, Vec<TYPE, 16> &d,            \
                                Bytes<BYTES>)                                  \
  {                                                                            \
    NEONX2_2 res;                                                              \
    res = vuzpq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a),         \
                            vreinterpretq_##NEON_SUF2##_##NEON_SUF(b));        \
    c = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[0]);                    \
    d = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[1]);                    \
  }

#define SIMDVEC_NEON_UNZIP_HALFS(TYPE, BYTES, NEON_SUF)                        \
  static SIMD_INLINE void unzip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b,  \
                                Vec<TYPE, 16> &c, Vec<TYPE, 16> &d,            \
                                Bytes<BYTES>)                                  \
  {                                                                            \
    c = vcombine_##NEON_SUF(vget_low_##NEON_SUF(a), vget_low_##NEON_SUF(b));   \
    d = vcombine_##NEON_SUF(vget_high_##NEON_SUF(a), vget_high_##NEON_SUF(b)); \
  }
SIMDVEC_NEON_UNZIP(Byte, 1, u8, u8, uint8x16x2_t)
SIMDVEC_NEON_UNZIP(Byte, 2, u8, u16, uint16x8x2_t)
SIMDVEC_NEON_UNZIP(Byte, 4, u8, u32, uint32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(Byte, 8, u8)

SIMDVEC_NEON_UNZIP(SignedByte, 1, s8, s8, int8x16x2_t)
SIMDVEC_NEON_UNZIP(SignedByte, 2, s8, s16, int16x8x2_t)
SIMDVEC_NEON_UNZIP(SignedByte, 4, s8, s32, int32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(SignedByte, 8, s8)

SIMDVEC_NEON_UNZIP(Word, 2, u16, u16, uint16x8x2_t)
SIMDVEC_NEON_UNZIP(Word, 4, u16, u32, uint32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(Word, 8, u16)

SIMDVEC_NEON_UNZIP(Short, 2, s16, s16, int16x8x2_t)
SIMDVEC_NEON_UNZIP(Short, 4, s16, s32, int32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(Short, 8, s16)

SIMDVEC_NEON_UNZIP(Int, 4, s32, s32, int32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(Int, 8, s32)

SIMDVEC_NEON_UNZIP(Float, 4, f32, f32, float32x4x2_t)
SIMDVEC_NEON_UNZIP_HALFS(Float, 8, f32)

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void unzip(const Vec<Long, 16> a, const Vec<Long, 16> b,
                              Vec<Long, 16> &c, Vec<Long, 16> &d, Bytes<8>)
{
  c = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
  d = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
}

static SIMD_INLINE void unzip(const Vec<Double, 16> a, const Vec<Double, 16> b,
                              Vec<Double, 16> &c, Vec<Double, 16> &d, Bytes<8>)
{
  c = vcombine_f64(vget_low_f64(a), vget_low_f64(b));
  d = vcombine_f64(vget_high_f64(a), vget_high_f64(b));
}
#endif

#undef SIMDVEC_NEON_UNZIP
#undef SIMDVEC_NEON_UNZIP_HALFS
// packs with signed saturation
static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Short, 16> &a,
                                             const Vec<Short, 16> &b,
                                             OutputType<SignedByte>)
{
  return vcombine_s8(vqmovn_s16(a), vqmovn_s16(b));
}

static SIMD_INLINE Vec<Short, 16> packs(const Vec<Int, 16> &a,
                                        const Vec<Int, 16> &b,
                                        OutputType<Short>)
{
  return vcombine_s16(vqmovn_s32(a), vqmovn_s32(b));
}

static SIMD_INLINE Vec<Short, 16> packs(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b,
                                        OutputType<Short>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Short>());
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Int, 16> packs(const Vec<Long, 16> &a,
                                      const Vec<Long, 16> &b, OutputType<Int>)
{
  return vcombine_s32(vqmovn_s64(a), vqmovn_s64(b));
}

static SIMD_INLINE Vec<Int, 16> packs(const Vec<Double, 16> &a,
                                      const Vec<Double, 16> &b, OutputType<Int>)
{
  return vcombine_s32(vqmovn_s64(vcvtq_s64_f64(a)),
                      vqmovn_s64(vcvtq_s64_f64(b)));
}

static SIMD_INLINE Vec<Float, 16> packs(const Vec<Long, 16> &a,
                                        const Vec<Long, 16> &b,
                                        OutputType<Float>)
{
  return vcombine_f32(vcvt_f32_f64(vcvtq_f64_s64(a)),
                      vcvt_f32_f64(vcvtq_f64_s64(b)));
}

static SIMD_INLINE Vec<Float, 16> packs(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b,
                                        OutputType<Float>)
{
  return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b));
}
#endif
// packs with unsigned saturation
static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Word, 16> &a,
                                       const Vec<Word, 16> &b, OutputType<Byte>)
{
  return vcombine_u8(vqmovn_u16(a), vqmovn_u16(b));
}

static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Short, 16> &a,
                                       const Vec<Short, 16> &b,
                                       OutputType<Byte>)
{
  return vcombine_u8(vqmovun_s16(a), vqmovun_s16(b));
}

static SIMD_INLINE Vec<Word, 16> packs(const Vec<Int, 16> &a,
                                       const Vec<Int, 16> &b, OutputType<Word>)
{
  return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
}

static SIMD_INLINE Vec<Word, 16> packs(const Vec<Float, 16> &a,
                                       const Vec<Float, 16> &b,
                                       OutputType<Word>)
{
  return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
               OutputType<Word>());
}

// Word -> SignedByte: narrow with unsigned saturation, then clamp to the
// signed maximum 0x7f
static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Word, 16> &a,
                                             const Vec<Word, 16> &b,
                                             OutputType<SignedByte>)
{
  return vcombine_s8(
    vreinterpret_s8_u8(vmin_u8(vqmovn_u16(a), vdup_n_u8(0x7f))),
    vreinterpret_s8_u8(vmin_u8(vqmovn_u16(b), vdup_n_u8(0x7f))));
}
// no-op extend for identical types
template <typename T>
static SIMD_INLINE void extend(const Vec<T, 16> &vIn, Vec<T, 16> vOut[1])
{
  vOut[0] = vIn;
}

// same-size signed/unsigned conversions with saturation
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Byte, 16> vOut[1])
{
  vOut[0] = vreinterpretq_u8_s8(vmaxq_s8(vIn, vdupq_n_s8(0)));
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
                               Vec<SignedByte, 16> vOut[1])
{
  vOut[0] = vreinterpretq_s8_u8(vminq_u8(vIn, vdupq_n_u8(0x7f)));
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Word, 16> vOut[1])
{
  vOut[0] = vreinterpretq_u16_s16(vmaxq_s16(vIn, vdupq_n_s16(0)));
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Short, 16> vOut[1])
{
  vOut[0] = vreinterpretq_s16_u16(vminq_u16(vIn, vdupq_n_u16(0x7fff)));
}
// signed widening by a factor of 2
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Short, 16> vOut[2])
{
  vOut[0] = vmovl_s8(vget_low_s8(vIn));
  vOut[1] = vmovl_s8(vget_high_s8(vIn));
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Int, 16> vOut[2])
{
  vOut[0] = vmovl_s16(vget_low_s16(vIn));
  vOut[1] = vmovl_s16(vget_high_s16(vIn));
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
                               Vec<Float, 16> vOut[2])
{
  vOut[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vIn)));
  vOut[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vIn)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Long, 16> vOut[2])
{
  vOut[0] = vmovl_s32(vget_low_s32(vIn));
  vOut[1] = vmovl_s32(vget_high_s32(vIn));
}

static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Double, 16> vOut[2])
{
  vOut[0] = vcvtq_f64_s64(vmovl_s32(vget_low_s32(vIn)));
  vOut[1] = vcvtq_f64_s64(vmovl_s32(vget_high_s32(vIn)));
}

static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Long, 16> vOut[2])
{
  vOut[0] = vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(vIn)));
  vOut[1] = vcvtq_s64_f64(vcvt_f64_f32(vget_high_f32(vIn)));
}

static SIMD_INLINE void extend(const Vec<Float, 16> &vIn,
                               Vec<Double, 16> vOut[2])
{
  vOut[0] = vcvt_f64_f32(vget_low_f32(vIn));
  vOut[1] = vcvt_f64_f32(vget_high_f32(vIn));
}
#endif
// unsigned widening by a factor of 2
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Word, 16> vOut[2])
{
  vOut[0] = vmovl_u8(vget_low_u8(vIn));
  vOut[1] = vmovl_u8(vget_high_u8(vIn));
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Short, 16> vOut[2])
{
  vOut[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vIn)));
  vOut[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vIn)));
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Int, 16> vOut[2])
{
  vOut[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vIn)));
  vOut[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vIn)));
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Float, 16> vOut[2])
{
  vOut[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vIn)));
  vOut[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vIn)));
}

// SignedByte -> Word: clamp negative values to zero, then widen
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Word, 16> vOut[2])
{
  const auto saturated = vmaxq_s8(vIn, vdupq_n_s8(0));
  vOut[0] = vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(saturated)));
  vOut[1] = vmovl_u8(vget_high_u8(vreinterpretq_u8_s8(saturated)));
}
// widening by a factor of 4, via the intermediate 2x widening
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Int, 16> vOut[4])
{
  Vec<Short, 16> vShort[2];
  extend(vIn, vShort);
  extend(vShort[0], vOut);
  extend(vShort[1], vOut + 2);
}

static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Float, 16> vOut[4])
{
  Vec<Short, 16> vShort[2];
  extend(vIn, vShort);
  extend(vShort[0], vOut);
  extend(vShort[1], vOut + 2);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Long, 16> vOut[4])
{
  Vec<Int, 16> vInt[2];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
}

static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
                               Vec<Double, 16> vOut[4])
{
  Vec<Int, 16> vInt[2];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
}
#endif
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Int, 16> vOut[4])
{
  Vec<Short, 16> vShort[2];
  extend(vIn, vShort);
  extend(vShort[0], vOut);
  extend(vShort[1], vOut + 2);
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Float, 16> vOut[4])
{
  Vec<Short, 16> vShort[2];
  extend(vIn, vShort);
  extend(vShort[0], vOut);
  extend(vShort[1], vOut + 2);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Long, 16> vOut[4])
{
  Vec<Int, 16> vInt[2];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
}

static SIMD_INLINE void extend(const Vec<Word, 16> &vIn,
                               Vec<Double, 16> vOut[4])
{
  Vec<Int, 16> vInt[2];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
}
#endif
// widening by a factor of 8 (64-bit element targets only)
#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Long, 16> vOut[8])
{
  Vec<Int, 16> vInt[4];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
  extend(vInt[2], vOut + 4);
  extend(vInt[3], vOut + 6);
}

static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
                               Vec<Double, 16> vOut[8])
{
  Vec<Int, 16> vInt[4];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
  extend(vInt[2], vOut + 4);
  extend(vInt[3], vOut + 6);
}
#endif

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Long, 16> vOut[8])
{
  Vec<Int, 16> vInt[4];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
  extend(vInt[2], vOut + 4);
  extend(vInt[3], vOut + 6);
}

static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
                               Vec<Double, 16> vOut[8])
{
  Vec<Int, 16> vInt[4];
  extend(vIn, vInt);
  extend(vInt[0], vOut);
  extend(vInt[1], vOut + 2);
  extend(vInt[2], vOut + 4);
  extend(vInt[3], vOut + 6);
}
#endif
// same-size int/float conversions
static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Float, 16> vOut[1])
{
  vOut[0] = cvts(vIn, OutputType<Float>());
}

static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Int, 16> vOut[1])
{
  vOut[0] = cvts(vIn, OutputType<Int>());
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void extend(const Vec<Long, 16> &vIn,
                               Vec<Double, 16> vOut[1])
{
  vOut[0] = cvts(vIn, OutputType<Double>());
}

static SIMD_INLINE void extend(const Vec<Double, 16> &vIn,
                               Vec<Long, 16> vOut[1])
{
  vOut[0] = cvts(vIn, OutputType<Long>());
}
#endif
// tag types for dispatching immediate shifts: nonZero indicates COUNT != 0,
// inRange indicates COUNT < number of bits per element
template <bool nonZero, bool inRange>
struct IsNonZeroInRange
{};

template <size_t RANGE, size_t INDEX>
struct IsNonZeroInGivenRange
  : public IsNonZeroInRange<(INDEX != 0), (INDEX < RANGE)>
{};

#define SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF)                      \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<true, true>)           \
  {                                                                            \
    return NEON_FCT##_##NEON_SUF(a, COUNT);                                    \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<false, true>)          \
  {                                                                            \
    /* shift by zero: identity */                                              \
    return a;                                                                  \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a)                 \
  {                                                                            \
    return FCT<COUNT>(a, IsNonZeroInGivenRange<sizeof(TYPE) * 8, COUNT>());    \
  }

// arithmetic out-of-range shift fills with the sign bit
#define SIMDVEC_NEON_SHIFT_ARITH(FCT, TYPE, NEON_FCT, NEON_SUF)                \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<true, false>)          \
  {                                                                            \
    return NEON_FCT##_##NEON_SUF(a, sizeof(TYPE) * 8 - 1);                     \
  }                                                                            \
  SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF)

// logical out-of-range shift yields zero
#define SIMDVEC_NEON_SHIFT_LOGICAL(FCT, TYPE, NEON_FCT, NEON_SUF)              \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &,                  \
                                       IsNonZeroInRange<true, false>)          \
  {                                                                            \
    return vmovq_n_##NEON_SUF(TYPE(0));                                        \
  }                                                                            \
  SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF)

// variants that reinterpret to the signedness expected by the intrinsic
#define SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2)               \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<true, true>)           \
  {                                                                            \
    return vreinterpretq_##NSUF##_##NSUF2(                                     \
      NFCT##_##NSUF2(vreinterpretq_##NSUF2##_##NSUF(a), COUNT));               \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<false, true>)          \
  {                                                                            \
    return a;                                                                  \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a)                 \
  {                                                                            \
    return FCT<COUNT>(a, IsNonZeroInGivenRange<sizeof(TYPE) * 8, COUNT>());    \
  }

#define SIMDVEC_NEON_SHIFT_REINTER_ARITH(FCT, TYPE, NFCT, NSUF, NSUF2)         \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a,                 \
                                       IsNonZeroInRange<true, false>)          \
  {                                                                            \
    return vreinterpretq_##NSUF##_##NSUF2(NFCT##_##NSUF2(                      \
      vreinterpretq_##NSUF2##_##NSUF(a), sizeof(TYPE) * 8 - 1));               \
  }                                                                            \
  SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2)

#define SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(FCT, TYPE, NFCT, NSUF, NSUF2)       \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &,                  \
                                       IsNonZeroInRange<true, false>)          \
  {                                                                            \
    return vmovq_n_##NSUF(TYPE(0));                                            \
  }                                                                            \
  SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2)
// arithmetic shift right with immediate count
SIMDVEC_NEON_SHIFT_REINTER_ARITH(srai, Byte, vshrq_n, u8, s8)
SIMDVEC_NEON_SHIFT_ARITH(srai, SignedByte, vshrq_n, s8)
SIMDVEC_NEON_SHIFT_REINTER_ARITH(srai, Word, vshrq_n, u16, s16)
SIMDVEC_NEON_SHIFT_ARITH(srai, Short, vshrq_n, s16)
SIMDVEC_NEON_SHIFT_ARITH(srai, Int, vshrq_n, s32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_SHIFT_ARITH(srai, Long, vshrq_n, s64)
#endif

// logical shift right with immediate count
SIMDVEC_NEON_SHIFT_LOGICAL(srli, Byte, vshrq_n, u8)
SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, SignedByte, vshrq_n, s8, u8)
SIMDVEC_NEON_SHIFT_LOGICAL(srli, Word, vshrq_n, u16)
SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Short, vshrq_n, s16, u16)
SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Int, vshrq_n, s32, u32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Long, vshrq_n, s64, u64)
#endif

// logical shift left with immediate count
SIMDVEC_NEON_SHIFT_LOGICAL(slli, Byte, vshlq_n, u8)
SIMDVEC_NEON_SHIFT_LOGICAL(slli, SignedByte, vshlq_n, s8)
SIMDVEC_NEON_SHIFT_LOGICAL(slli, Word, vshlq_n, u16)
SIMDVEC_NEON_SHIFT_LOGICAL(slli, Short, vshlq_n, s16)
SIMDVEC_NEON_SHIFT_LOGICAL(slli, Int, vshlq_n, s32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_SHIFT_LOGICAL(slli, Long, vshlq_n, s64)
#endif

#undef SIMDVEC_NEON_SHIFT
#undef SIMDVEC_NEON_SHIFT_ARITH
#undef SIMDVEC_NEON_SHIFT_LOGICAL
#undef SIMDVEC_NEON_SHIFT_REINTER
#undef SIMDVEC_NEON_SHIFT_REINTER_ARITH
#undef SIMDVEC_NEON_SHIFT_REINTER_LOGICAL
// run-time arithmetic shift right; NEON only shifts right via vshlq with a
// negative count, so the count is clamped to the element width and negated
static SIMD_INLINE Vec<Byte, 16> sra(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
  return vreinterpretq_u8_s8(
    vshlq_s8(vreinterpretq_s8_u8(a), vdupq_n_s8(scount)));
}

static SIMD_INLINE Vec<SignedByte, 16> sra(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
  return vshlq_s8(a, vdupq_n_s8(scount));
}

static SIMD_INLINE Vec<Word, 16> sra(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
  return vreinterpretq_u16_s16(
    vshlq_s16(vreinterpretq_s16_u16(a), vdupq_n_s16(scount)));
}

static SIMD_INLINE Vec<Short, 16> sra(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
  return vshlq_s16(a, vdupq_n_s16(scount));
}

static SIMD_INLINE Vec<Int, 16> sra(const Vec<Int, 16> &a, const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(32)));
  return vshlq_s32(a, vdupq_n_s32(scount));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> sra(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(64)));
  return vshlq_s64(a, vdupq_n_s64(scount));
}
#endif
// run-time logical shift right (again via vshlq with a negated count)
static SIMD_INLINE Vec<Byte, 16> srl(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
  return vshlq_u8(a, vdupq_n_s8(scount));
}

static SIMD_INLINE Vec<SignedByte, 16> srl(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
  return vreinterpretq_s8_u8(
    vshlq_u8(vreinterpretq_u8_s8(a), vdupq_n_s8(scount)));
}

static SIMD_INLINE Vec<Word, 16> srl(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
  return vshlq_u16(a, vdupq_n_s16(scount));
}

static SIMD_INLINE Vec<Short, 16> srl(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
  return vreinterpretq_s16_u16(
    vshlq_u16(vreinterpretq_u16_s16(a), vdupq_n_s16(scount)));
}

static SIMD_INLINE Vec<Int, 16> srl(const Vec<Int, 16> &a, const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(32)));
  return vreinterpretq_s32_u32(
    vshlq_u32(vreinterpretq_u32_s32(a), vdupq_n_s32(scount)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> srl(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  int8_t scount = -((int8_t) std::min(count, uint8_t(64)));
  return vreinterpretq_s64_u64(
    vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(scount)));
}
#endif
// run-time logical shift left; counts are clamped to the element width
static SIMD_INLINE Vec<Byte, 16> sll(const Vec<Byte, 16> &a,
                                     const uint8_t count)
{
  return vshlq_u8(a, vdupq_n_s8(std::min(count, uint8_t(8))));
}

static SIMD_INLINE Vec<SignedByte, 16> sll(const Vec<SignedByte, 16> &a,
                                           const uint8_t count)
{
  return vshlq_s8(a, vdupq_n_s8(std::min(count, uint8_t(8))));
}

static SIMD_INLINE Vec<Word, 16> sll(const Vec<Word, 16> &a,
                                     const uint8_t count)
{
  return vshlq_u16(a, vdupq_n_s16(std::min(count, uint8_t(16))));
}

static SIMD_INLINE Vec<Short, 16> sll(const Vec<Short, 16> &a,
                                      const uint8_t count)
{
  return vshlq_s16(a, vdupq_n_s16(std::min(count, uint8_t(16))));
}

static SIMD_INLINE Vec<Int, 16> sll(const Vec<Int, 16> &a, const uint8_t count)
{
  return vshlq_s32(a, vdupq_n_s32(std::min(count, uint8_t(32))));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> sll(const Vec<Long, 16> &a,
                                     const uint8_t count)
{
  return vshlq_s64(a, vdupq_n_s64(std::min(count, uint8_t(64))));
}
#endif
// horizontal pairwise addition
#define SIMDVEC_NEON_HADD(TYPE, NEON_SUF)                                      \
  static SIMD_INLINE Vec<TYPE, 16> hadd(const Vec<TYPE, 16> &a,                \
                                        const Vec<TYPE, 16> &b)                \
  {                                                                            \
    return vcombine_##NEON_SUF(                                                \
      vpadd_##NEON_SUF(vget_low_##NEON_SUF(a), vget_high_##NEON_SUF(a)),       \
      vpadd_##NEON_SUF(vget_low_##NEON_SUF(b), vget_high_##NEON_SUF(b)));      \
  }

SIMDVEC_NEON_HADD(Byte, u8)
SIMDVEC_NEON_HADD(SignedByte, s8)
SIMDVEC_NEON_HADD(Word, u16)
SIMDVEC_NEON_HADD(Short, s16)
SIMDVEC_NEON_HADD(Int, s32)
SIMDVEC_NEON_HADD(Float, f32)
#ifdef SIMD_64BIT_TYPES
// there is no 64-bit vpadd_*; each vector holds a single pair
static SIMD_INLINE Vec<Long, 16> hadd(const Vec<Long, 16> &a,
                                      const Vec<Long, 16> &b)
{
  return vcombine_s64(vget_low_s64(a) + vget_high_s64(a),
                      vget_low_s64(b) + vget_high_s64(b));
}

static SIMD_INLINE Vec<Double, 16> hadd(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return vcombine_f64(vget_low_f64(a) + vget_high_f64(a),
                      vget_low_f64(b) + vget_high_f64(b));
}
#endif

#undef SIMDVEC_NEON_HADD
// horizontal pairwise addition with saturation: de-interleave, then adds
template <typename T>
static SIMD_INLINE Vec<T, 16> hadds(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return adds(x, y);
}

static SIMD_INLINE Vec<Short, 16> hadds(const Vec<Short, 16> &a,
                                        const Vec<Short, 16> &b)
{
  return vcombine_s16(vqmovn_s32(vpaddlq_s16(a)), vqmovn_s32(vpaddlq_s16(b)));
}

static SIMD_INLINE Vec<Int, 16> hadds(const Vec<Int, 16> &a,
                                      const Vec<Int, 16> &b)
{
  return vcombine_s32(vqmovn_s64(vpaddlq_s32(a)), vqmovn_s64(vpaddlq_s32(b)));
}

// floating-point addition does not saturate
static SIMD_INLINE Vec<Float, 16> hadds(const Vec<Float, 16> &a,
                                        const Vec<Float, 16> &b)
{
  return hadd(a, b);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> hadds(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return hadd(a, b);
}
#endif

// horizontal pairwise subtraction: de-interleave, then sub
template <typename T>
static SIMD_INLINE Vec<T, 16> hsub(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return sub(x, y);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> hsub(const Vec<Double, 16> &a,
                                        const Vec<Double, 16> &b)
{
  return vcombine_f64(vget_low_f64(a) - vget_high_f64(a),
                      vget_low_f64(b) - vget_high_f64(b));
}
#endif

// horizontal pairwise subtraction with saturation
template <typename T>
static SIMD_INLINE Vec<T, 16> hsubs(const Vec<T, 16> &a, const Vec<T, 16> &b)
{
  Vec<T, 16> x, y;
  unzip(a, b, x, y, Bytes<sizeof(T)>());
  return subs(x, y);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> hsubs(const Vec<Double, 16> &a,
                                         const Vec<Double, 16> &b)
{
  return vcombine_f64(vget_low_f64(a) - vget_high_f64(a),
                      vget_low_f64(b) - vget_high_f64(b));
}
#endif
// alignre(h, l) concatenates h:l and shifts right by COUNT elements; COUNT
// is dispatched through a Range tag so that vextq only ever receives valid
// immediates
#define SIMDVEC_NEON_ALIGNRE(TYPE, NEON_SUF)                                   \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> alignre(                                    \
    const Vec<TYPE, 16> &, const Vec<TYPE, 16> &l,                             \
    Range<true, 0, Vec<TYPE, 16>::elements>)                                   \
  {                                                                            \
    return l;                                                                  \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> alignre(                                    \
    const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &l,                            \
    Range<false, 0, Vec<TYPE, 16>::elements>)                                  \
  {                                                                            \
    return vextq_##NEON_SUF(l, h, COUNT);                                      \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> alignre(                                    \
    const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &,                             \
    Range<true, Vec<TYPE, 16>::elements, 2 * Vec<TYPE, 16>::elements>)         \
  {                                                                            \
    return h;                                                                  \
  }                                                                            \
  template <size_t COUNT>                                                      \
  static SIMD_INLINE Vec<TYPE, 16> alignre(                                    \
    const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &,                             \
    Range<false, Vec<TYPE, 16>::elements, 2 * Vec<TYPE, 16>::elements>)        \
  {                                                                            \
    return vextq_##NEON_SUF(h, vmovq_n_##NEON_SUF(TYPE(0)),                    \
                            COUNT - Vec<TYPE, 16>::elements);                  \
  }                                                                            \
  template <size_t COUNT, bool AT_LL, size_t LL_INCL, size_t UL_EXCL>          \
  static SIMD_INLINE Vec<TYPE, 16> alignre(const Vec<TYPE, 16> &,              \
                                           const Vec<TYPE, 16> &,              \
                                           Range<AT_LL, LL_INCL, UL_EXCL>)     \
  {                                                                            \
    return vmovq_n_##NEON_SUF(TYPE(0));                                        \
  }
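// For illustration with Int elements (4 per vector): alignre<1>(h, l)
// returns (l1, l2, l3, h0), alignre<4>(h, l) returns h, and
// alignre<6>(h, l) returns (h2, h3, 0, 0); counts of 8 and above yield an
// all-zero vector.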
SIMDVEC_NEON_ALIGNRE(Byte, u8)
SIMDVEC_NEON_ALIGNRE(SignedByte, s8)
SIMDVEC_NEON_ALIGNRE(Word, u16)
SIMDVEC_NEON_ALIGNRE(Short, s16)
SIMDVEC_NEON_ALIGNRE(Int, s32)
SIMDVEC_NEON_ALIGNRE(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_ALIGNRE(Long, s64)
SIMDVEC_NEON_ALIGNRE(Double, f64)
#endif

template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> alignre(const Vec<T, 16> &h, const Vec<T, 16> &l)
{
  return alignre<COUNT>(h, l, SizeRange<COUNT, Vec<T, 16>::elements>());
}

#undef SIMDVEC_NEON_ALIGNRE

// element-wise shift right of the full vector (zero fill)
template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> srle(const Vec<T, 16> &a)
{
  return alignre<COUNT>(setzero(OutputType<T>(), Integer<16>()), a);
}

// element-wise shift left of the full vector (zero fill)
template <size_t COUNT, typename T>
static SIMD_INLINE Vec<T, 16> slle(const Vec<T, 16> &a)
{
  SIMD_IF_CONSTEXPR (COUNT < Vec<T, 16>::elements) {
    return alignre<Vec<T, 16>::elements - COUNT>(
      a, setzero(OutputType<T>(), Integer<16>()));
  } else {
    return setzero(OutputType<T>(), Integer<16>());
  }
}
// vtbl index tables for de-interleaving N interleaved channels; selected
// by channel count (first Integer tag) and element size (second tag)

// 1-byte elements
static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
                                          Integer<1>)
{
  const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
    {0, 2, 4, 6, 8, 10, 12, 14},
    {1, 3, 5, 7, 9, 11, 13, 15},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
                                          Integer<1>)
{
  const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
    {0, 3, 6, 9, 12, 15, 18, 21},
    {1, 4, 7, 10, 13, 16, 19, 22},
    {2, 5, 8, 11, 14, 17, 20, 23},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
                                          Integer<1>)
{
  const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
    {0, 4, 8, 12, 16, 20, 24, 28},
    {1, 5, 9, 13, 17, 21, 25, 29},
    {2, 6, 10, 14, 18, 22, 26, 30},
    {3, 7, 11, 15, 19, 23, 27, 31},
  };
  return table[index];
}

// 2-byte elements
static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
                                          Integer<2>)
{
  const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 4, 5, 8, 9, 12, 13},
    {2, 3, 6, 7, 10, 11, 14, 15},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
                                          Integer<2>)
{
  const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 6, 7, 12, 13, 18, 19},
    {2, 3, 8, 9, 14, 15, 20, 21},
    {4, 5, 10, 11, 16, 17, 22, 23},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
                                          Integer<2>)
{
  const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 8, 9, 16, 17, 24, 25},
    {2, 3, 10, 11, 18, 19, 26, 27},
    {4, 5, 12, 13, 20, 21, 28, 29},
    {6, 7, 14, 15, 22, 23, 30, 31},
  };
  return table[index];
}

// 4-byte elements
static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
                                          Integer<4>)
{
  const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 2, 3, 8, 9, 10, 11},
    {4, 5, 6, 7, 12, 13, 14, 15},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
                                          Integer<4>)
{
  const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 2, 3, 12, 13, 14, 15},
    {4, 5, 6, 7, 16, 17, 18, 19},
    {8, 9, 10, 11, 20, 21, 22, 23},
  };
  return table[index];
}

static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
                                          Integer<4>)
{
  const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
    {0, 1, 2, 3, 16, 17, 18, 19},
    {4, 5, 6, 7, 20, 21, 22, 23},
    {8, 9, 10, 11, 24, 25, 26, 27},
    {12, 13, 14, 15, 28, 29, 30, 31},
  };
  return table[index];
}

template <size_t N, typename T>
static SIMD_INLINE uint8x8_t swizzleTable(const size_t index)
{
  return swizzleTable(index, Integer<N>(), Integer<sizeof(T)>());
}
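// For illustration: in the 2-channel, 1-byte table above, row 0
// ({0, 2, 4, ...}) gathers channel 0 and row 1 ({1, 3, 5, ...}) gathers
// channel 1 when fed to vtbl2_u8; in the 3-channel tables, indices up to
// 23 address the three 8-byte registers passed to vtbl3_u8.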
// swizzle (de-interleave) of a single vector: nothing to do
template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16>[1], Integer<1>)
{}

template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
{
  const Vec<Byte, 16> vByte[2] = {
    reinterpret(v[0], OutputType<Byte>()),
    reinterpret(v[1], OutputType<Byte>()),
  };
  for (size_t i = 0; i < 2; i++) {
    v[i] = reinterpret(
      Vec<Byte, 16>(vcombine_u8(
        vtbl2_u8({vget_low_u8(vByte[0]), vget_high_u8(vByte[0])},
                 swizzleTable<2, T>(i)),
        vtbl2_u8({vget_low_u8(vByte[1]), vget_high_u8(vByte[1])},
                 swizzleTable<2, T>(i)))),
      OutputType<T>());
  }
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void swizzle(Vec<Long, 16> v[2], Integer<2>)
{
  const Vec<Long, 16> tmp[2] = {v[0], v[1]};
  v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_low_s64(tmp[1]));
  v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_high_s64(tmp[1]));
}

static SIMD_INLINE void swizzle(Vec<Double, 16> v[2], Integer<2>)
{
  const Vec<Double, 16> tmp[2] = {v[0], v[1]};
  v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_low_f64(tmp[1]));
  v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_high_f64(tmp[1]));
}
#endif
template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
{
  const Vec<Byte, 16> vByte[3] = {
    reinterpret(v[0], OutputType<Byte>()),
    reinterpret(v[1], OutputType<Byte>()),
    reinterpret(v[2], OutputType<Byte>()),
  };
  const uint8x8x3_t vu[2] = {
    {vget_low_u8(vByte[0]), vget_high_u8(vByte[0]), vget_low_u8(vByte[1])},
    {vget_high_u8(vByte[1]), vget_low_u8(vByte[2]), vget_high_u8(vByte[2])},
  };
  for (size_t i = 0; i < 3; i++) {
    v[i] = reinterpret(
      Vec<Byte, 16>(vcombine_u8(vtbl3_u8(vu[0], swizzleTable<3, T>(i)),
                                vtbl3_u8(vu[1], swizzleTable<3, T>(i)))),
      OutputType<T>());
  }
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void swizzle(Vec<Long, 16> v[3], Integer<3>)
{
  const Vec<Long, 16> tmp[3] = {v[0], v[1], v[2]};
  v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_high_s64(tmp[1]));
  v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_low_s64(tmp[2]));
  v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_high_s64(tmp[2]));
}

static SIMD_INLINE void swizzle(Vec<Double, 16> v[3], Integer<3>)
{
  const Vec<Double, 16> tmp[3] = {v[0], v[1], v[2]};
  v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_high_f64(tmp[1]));
  v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_low_f64(tmp[2]));
  v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_high_f64(tmp[2]));
}
#endif
template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
{
  const Vec<Byte, 16> vByte[4] = {
    reinterpret(v[0], OutputType<Byte>()),
    reinterpret(v[1], OutputType<Byte>()),
    reinterpret(v[2], OutputType<Byte>()),
    reinterpret(v[3], OutputType<Byte>()),
  };
  const uint8x8x4_t vu[2] = {
    {vget_low_u8(vByte[0]), vget_high_u8(vByte[0]), vget_low_u8(vByte[1]),
     vget_high_u8(vByte[1])},
    {vget_low_u8(vByte[2]), vget_high_u8(vByte[2]), vget_low_u8(vByte[3]),
     vget_high_u8(vByte[3])},
  };
  for (size_t i = 0; i < 4; i++) {
    v[i] = reinterpret(
      Vec<Byte, 16>(vcombine_u8(vtbl4_u8(vu[0], swizzleTable<4, T>(i)),
                                vtbl4_u8(vu[1], swizzleTable<4, T>(i)))),
      OutputType<T>());
  }
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void swizzle(Vec<Long, 16> v[4], Integer<4>)
{
  const Vec<Long, 16> tmp[4] = {v[0], v[1], v[2], v[3]};
  v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_low_s64(tmp[2]));
  v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_high_s64(tmp[2]));
  v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_low_s64(tmp[3]));
  v[3] = vcombine_s64(vget_high_s64(tmp[1]), vget_high_s64(tmp[3]));
}

static SIMD_INLINE void swizzle(Vec<Double, 16> v[4], Integer<4>)
{
  const Vec<Double, 16> tmp[4] = {v[0], v[1], v[2], v[3]};
  v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_low_f64(tmp[2]));
  v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_high_f64(tmp[2]));
  v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_low_f64(tmp[3]));
  v[3] = vcombine_f64(vget_high_f64(tmp[1]), vget_high_f64(tmp[3]));
}
#endif
// vtbl masks for 5-channel de-interleaving, indexed by element size
// (rows 0 and 3 are unused padding); 99 marks outputs that are discarded
static const uint8_t swizzleMask5Lo[5][32] SIMD_ATTR_ALIGNED(16) = {
  {},
  {0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17,
   3, 8, 13, 18, 4, 9, 14, 19, 99, 99, 99, 99},
  {0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15,
   6, 7, 16, 17, 8, 9, 18, 19, 99, 99, 99, 99},
  {},
  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
   12, 13, 14, 15, 16, 17, 18, 19, 99, 99, 99, 99},
};

static const uint8_t swizzleMask5Hi[5][32] SIMD_ATTR_ALIGNED(16) = {
  {},
  {4, 9, 14, 19, 5, 10, 15, 20, 6, 11, 16, 21,
   7, 12, 17, 22, 8, 13, 18, 23, 99, 99, 99, 99},
  {4, 5, 14, 15, 6, 7, 16, 17, 8, 9, 18, 19,
   10, 11, 20, 21, 12, 13, 22, 23, 99, 99, 99, 99},
  {},
  {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 99, 99, 99, 99},
};

// loads the masks for element size SIZE into two vtbl3 register groups
template <size_t SIZE>
struct SwizzleTable5
{
  uint8x8x3_t table[2];
  SwizzleTable5()
  {
    for (size_t i = 0; i < 3; i++) {
      table[0].val[i] = vld1_u8(&swizzleMask5Lo[SIZE][i * 8]);
      table[1].val[i] = vld1_u8(&swizzleMask5Hi[SIZE][i * 8]);
    }
  }
};
template <typename T>
static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
{
  // distribute the ten 8-byte halves of v[0..4] over four vtbl3 groups
  uint8x8x3_t vu[4];
  const size_t k0[4] = {0, 2, 5, 7};
  for (size_t i = 0; i < 4; i++) {
    for (size_t j = 0; j < 3; j++) {
      const size_t k = k0[i] + j;
      const Vec<Byte, 16> vb = reinterpret(v[k >> 1], OutputType<Byte>());
      vu[i].val[j] = (k & 1) ? vget_high_u8(vb) : vget_low_u8(vb);
    }
  }
  static const SwizzleTable5<sizeof(T)> t;
  uint8x8_t r[2][3][3];
  for (size_t n = 0, k = 0; n < 2; n++)
    for (size_t i = 0; i < 2; i++, k++)
      for (size_t j = 0; j < 3; j++)
        r[n][i][j] = vtbl3_u8(vu[k], t.table[i].val[j]);
  int32x2x2_t z[2][3];
  for (size_t n = 0; n < 2; n++)
    for (size_t j = 0; j < 3; j++)
      z[n][j] = vzip_s32(vreinterpret_s32_u8(r[n][0][j]),
                         vreinterpret_s32_u8(r[n][1][j]));
  // only five of the six combined vectors carry valid data
  for (size_t j = 0, k = 0; j < 3; j++) {
    for (size_t lh = 0; lh < 2; lh++) {
      if (k < 5) {
        v[k++] = reinterpret(
          Vec<Int, 16>(vcombine_s32(z[0][j].val[lh], z[1][j].val[lh])),
          OutputType<T>());
      }
    }
  }
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE void swizzle(Vec<Long, 16> v[5], Integer<5>)
{
  const Vec<Long, 16> tmp[5] = {v[0], v[1], v[2], v[3], v[4]};
  v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_high_s64(tmp[2]));
  v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_low_s64(tmp[3]));
  v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_high_s64(tmp[3]));
  v[3] = vcombine_s64(vget_high_s64(tmp[1]), vget_low_s64(tmp[4]));
  v[4] = vcombine_s64(vget_low_s64(tmp[2]), vget_high_s64(tmp[4]));
}

static SIMD_INLINE void swizzle(Vec<Double, 16> v[5], Integer<5>)
{
  const Vec<Double, 16> tmp[5] = {v[0], v[1], v[2], v[3], v[4]};
  v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_high_f64(tmp[2]));
  v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_low_f64(tmp[3]));
  v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_high_f64(tmp[3]));
  v[3] = vcombine_f64(vget_high_f64(tmp[1]), vget_low_f64(tmp[4]));
  v[4] = vcombine_f64(vget_low_f64(tmp[2]), vget_high_f64(tmp[4]));
}
#endif
// comparisons returning all-ones/all-zeros masks in the input type
#define SIMDVEC_NEON_CMP(CMP, TYPE, NEON_SUF, NEON_USUF)                       \
  static SIMD_INLINE Vec<TYPE, 16> cmp##CMP(const Vec<TYPE, 16> &a,            \
                                            const Vec<TYPE, 16> &b)            \
  {                                                                            \
    return vreinterpretq_##NEON_SUF##_##NEON_USUF(                             \
      vc##CMP##q##_##NEON_SUF(a, b));                                          \
  }

#ifdef SIMD_64BIT_TYPES
#define SIMDVEC_NEON_CMP_ALL(CMP)                                              \
  SIMDVEC_NEON_CMP(CMP, Byte, u8, u8)                                          \
  SIMDVEC_NEON_CMP(CMP, SignedByte, s8, u8)                                    \
  SIMDVEC_NEON_CMP(CMP, Word, u16, u16)                                        \
  SIMDVEC_NEON_CMP(CMP, Short, s16, u16)                                       \
  SIMDVEC_NEON_CMP(CMP, Int, s32, u32)                                         \
  SIMDVEC_NEON_CMP(CMP, Long, s64, u64)                                        \
  SIMDVEC_NEON_CMP(CMP, Float, f32, u32)                                       \
  SIMDVEC_NEON_CMP(CMP, Double, f64, u64)
#else
#define SIMDVEC_NEON_CMP_ALL(CMP)                                              \
  SIMDVEC_NEON_CMP(CMP, Byte, u8, u8)                                          \
  SIMDVEC_NEON_CMP(CMP, SignedByte, s8, u8)                                    \
  SIMDVEC_NEON_CMP(CMP, Word, u16, u16)                                        \
  SIMDVEC_NEON_CMP(CMP, Short, s16, u16)                                       \
  SIMDVEC_NEON_CMP(CMP, Int, s32, u32)                                         \
  SIMDVEC_NEON_CMP(CMP, Float, f32, u32)
#endif

SIMDVEC_NEON_CMP_ALL(lt)
SIMDVEC_NEON_CMP_ALL(le)
SIMDVEC_NEON_CMP_ALL(eq)
SIMDVEC_NEON_CMP_ALL(gt)
SIMDVEC_NEON_CMP_ALL(ge)

#undef SIMDVEC_NEON_CMP_ALL
#undef SIMDVEC_NEON_CMP

// there is no vcneqq intrinsic; negate the equality mask instead
#define SIMDVEC_NEON_CMPNEQ(TYPE, NEON_SUF, NEON_USUF)                         \
  static SIMD_INLINE Vec<TYPE, 16> cmpneq(const Vec<TYPE, 16> &a,              \
                                          const Vec<TYPE, 16> &b)              \
  {                                                                            \
    return vreinterpretq_##NEON_SUF##_u32(                                     \
      vmvnq_u32(vreinterpretq_u32_##NEON_USUF(vceqq_##NEON_SUF(a, b))));       \
  }

SIMDVEC_NEON_CMPNEQ(Byte, u8, u8)
SIMDVEC_NEON_CMPNEQ(SignedByte, s8, u8)
SIMDVEC_NEON_CMPNEQ(Word, u16, u16)
SIMDVEC_NEON_CMPNEQ(Short, s16, u16)
SIMDVEC_NEON_CMPNEQ(Int, s32, u32)
SIMDVEC_NEON_CMPNEQ(Float, f32, u32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_CMPNEQ(Long, s64, u64)
SIMDVEC_NEON_CMPNEQ(Double, f64, u64)
#endif

#undef SIMDVEC_NEON_CMPNEQ
// element-wise selection: cond must be all-ones/all-zeros per element, as
// produced by the comparison functions above
#define SIMDVEC_NEON_IFELSE(T, NEON_SUF, NEON_USUF)                            \
  static SIMD_INLINE Vec<T, 16> ifelse(const Vec<T, 16> &cond,                 \
                                       const Vec<T, 16> &trueVal,              \
                                       const Vec<T, 16> &falseVal)             \
  {                                                                            \
    return vbslq_##NEON_SUF(vreinterpretq_##NEON_USUF##_##NEON_SUF(cond),      \
                            trueVal, falseVal);                                \
  }

SIMDVEC_NEON_IFELSE(Byte, u8, u8)
SIMDVEC_NEON_IFELSE(SignedByte, s8, u8)
SIMDVEC_NEON_IFELSE(Word, u16, u16)
SIMDVEC_NEON_IFELSE(Short, s16, u16)
SIMDVEC_NEON_IFELSE(Int, s32, u32)
SIMDVEC_NEON_IFELSE(Float, f32, u32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_IFELSE(Long, s64, u64)
SIMDVEC_NEON_IFELSE(Double, f64, u64)
#endif

#undef SIMDVEC_NEON_IFELSE
// bit_and: float types are reinterpreted as integers, ANDed, and
// reinterpreted back

SIMDVEC_NEON_BINARY_ALLINT(bit_and, vandq)

static SIMD_INLINE Vec<Float, 16> bit_and(const Vec<Float, 16> &a,
                                          const Vec<Float, 16> &b)
{
  return vreinterpretq_f32_s32(
      vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> bit_and(const Vec<Double, 16> &a,
                                           const Vec<Double, 16> &b)
{
  return vreinterpretq_f64_s64(
      vandq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
}
#endif
// bit_or

SIMDVEC_NEON_BINARY_ALLINT(bit_or, vorrq)

static SIMD_INLINE Vec<Float, 16> bit_or(const Vec<Float, 16> &a,
                                         const Vec<Float, 16> &b)
{
  return vreinterpretq_f32_s32(
      vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> bit_or(const Vec<Double, 16> &a,
                                          const Vec<Double, 16> &b)
{
  return vreinterpretq_f64_s64(
      vorrq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
}
#endif

// bit_andnot(a, b) = (~a) & b for all element types

template <typename T>
static SIMD_INLINE Vec<T, 16> bit_andnot(const Vec<T, 16> &a,
                                         const Vec<T, 16> &b)
{
  return bit_and(bit_not(a), b);
}
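// Illustration: bit_andnot(mask, x) keeps x only in lanes where mask is
// all-zeros, e.g. zeroing exactly the lanes where a == b:
//   Vec<Int, 16> cleared = bit_andnot(cmpeq(a, b), x);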
SIMDVEC_NEON_BINARY_ALLINT(bit_xor, veorq)

static SIMD_INLINE Vec<Float, 16> bit_xor(const Vec<Float, 16> &a,
                                          const Vec<Float, 16> &b)
{
  return vreinterpretq_f32_s32(
      veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Double, 16> bit_xor(const Vec<Double, 16> &a,
                                           const Vec<Double, 16> &b)
{
  return vreinterpretq_f64_s64(
      veorq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
}
#endif
// bit_not: vmvnq only exists for lanes up to 32 bit, so the 64-bit types
// invert via 32-bit lanes

SIMDVEC_NEON_UNARY(bit_not, Byte, vmvnq, u8)
SIMDVEC_NEON_UNARY(bit_not, SignedByte, vmvnq, s8)
SIMDVEC_NEON_UNARY(bit_not, Word, vmvnq, u16)
SIMDVEC_NEON_UNARY(bit_not, Short, vmvnq, s16)
SIMDVEC_NEON_UNARY(bit_not, Int, vmvnq, s32)

static SIMD_INLINE Vec<Float, 16> bit_not(const Vec<Float, 16> &a)
{
  return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a)));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> bit_not(const Vec<Long, 16> &a)
{
  return vreinterpretq_s64_u32(vmvnq_u32(vreinterpretq_u32_s64(a)));
}
static SIMD_INLINE Vec<Double, 16> bit_not(const Vec<Double, 16> &a)
{
  return vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a)));
}
#endif
// avg: rounding average; the integer versions use vrhaddq, i.e.
// (a + b + 1) >> 1

SIMDVEC_NEON_BINARY(avg, Byte, vrhaddq, u8)
SIMDVEC_NEON_BINARY(avg, SignedByte, vrhaddq, s8)
SIMDVEC_NEON_BINARY(avg, Word, vrhaddq, u16)
SIMDVEC_NEON_BINARY(avg, Short, vrhaddq, s16)
SIMDVEC_NEON_BINARY(avg, Int, vrhaddq, s32)

static SIMD_INLINE Vec<Float, 16> avg(const Vec<Float, 16> &a,
                                      const Vec<Float, 16> &b)
{
  return vmulq_n_f32(vaddq_f32(a, b), 0.5f);
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> avg(const Vec<Long, 16> &a,
                                     const Vec<Long, 16> &b)
{
  // there is no vrhaddq_s64; use the overflow-free identity
  // (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1) (arithmetic shift)
  return vsubq_s64(vorrq_s64(a, b), vshrq_n_s64(veorq_s64(a, b), 1));
}
static SIMD_INLINE Vec<Double, 16> avg(const Vec<Double, 16> &a,
                                       const Vec<Double, 16> &b)
{
  return vmulq_n_f64(vaddq_f64(a, b), 0.5);
}
#endif
// bitwise OR for float32x2_t (no native vorr for float)

static SIMD_INLINE float32x2_t vorr_f32(float32x2_t a, float32x2_t b)
{
  return vreinterpret_f32_s32(
      vorr_s32(vreinterpret_s32_f32(a), vreinterpret_s32_f32(b)));
}

// test_all_zeros: OR the two vector halves, reduce with a pairwise max;
// the result is zero iff every bit of the input is zero

#define SIMDVEC_NEON_TESTALLZEROS(T, NEON_SUF)                          \
  static SIMD_INLINE bool test_all_zeros(const Vec<T, 16> &a)           \
  {                                                                     \
    uint32x4_t au = vreinterpretq_u32_##NEON_SUF(a);                    \
    uint32x2_t tmp = vorr_u32(vget_low_u32(au), vget_high_u32(au));     \
    return !vget_lane_u32(vpmax_u32(tmp, tmp), 0);                      \
  }

SIMDVEC_NEON_TESTALLZEROS(Byte, u8)
SIMDVEC_NEON_TESTALLZEROS(SignedByte, s8)
SIMDVEC_NEON_TESTALLZEROS(Word, u16)
SIMDVEC_NEON_TESTALLZEROS(Short, s16)
SIMDVEC_NEON_TESTALLZEROS(Int, s32)
SIMDVEC_NEON_TESTALLZEROS(Float, f32)
#ifdef SIMD_64BIT_TYPES
SIMDVEC_NEON_TESTALLZEROS(Long, s64)
SIMDVEC_NEON_TESTALLZEROS(Double, f64)
#endif

#undef SIMDVEC_NEON_TESTALLZEROS
// test_all_ones: all bits set <=> the bitwise complement is all zeros

template <typename T>
static SIMD_INLINE bool test_all_ones(const Vec<T, 16> &a)
{
  return test_all_zeros(bit_not(a));
}
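// Usage sketch (illustration only): whole-vector predicates built from
// the per-lane comparison masks:
//   if (test_all_ones(cmpeq(a, b)))  { /* every lane of a equals b */ }
//   if (test_all_zeros(cmpeq(a, b))) { /* no lane of a equals b */ }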
// reverse: reverse the order of all lanes

#define SIMDVEC_NEON_REVERSE(T, NEON_SUF)                               \
  static SIMD_INLINE Vec<T, 16> reverse(const Vec<T, 16> &a)            \
  {                                                                     \
    const auto t = vrev64q_##NEON_SUF(a);                               \
    return vcombine_##NEON_SUF(vget_high_##NEON_SUF(t),                 \
                               vget_low_##NEON_SUF(t));                 \
  }

SIMDVEC_NEON_REVERSE(Byte, u8)
SIMDVEC_NEON_REVERSE(SignedByte, s8)
SIMDVEC_NEON_REVERSE(Word, u16)
SIMDVEC_NEON_REVERSE(Short, s16)
SIMDVEC_NEON_REVERSE(Int, s32)
SIMDVEC_NEON_REVERSE(Float, f32)
#ifdef SIMD_64BIT_TYPES
// 64-bit lanes: vrev64q reverses within 64-bit doublewords, so here it
// suffices to swap the two vector halves
static SIMD_INLINE Vec<Long, 16> reverse(const Vec<Long, 16> &a)
{
  return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
}
static SIMD_INLINE Vec<Double, 16> reverse(const Vec<Double, 16> &a)
{
  return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
}
#endif

#undef SIMDVEC_NEON_REVERSE
// msb2int: collect the most significant bit of each lane into an integer
// (lane i -> bit i), comparable to SSE movemask

static SIMD_INLINE uint64_t msb2int(const Vec<Byte, 16> &a)
{
  // shift the MSB of each byte down to bit 0
  uint8x16_t high_bits = vshrq_n_u8(a, 7);
  // shift-right-and-accumulate with progressively doubled lane width:
  // each step ORs the bits of the upper half of a lane pair into the
  // lower half, so the mask bits accumulate in bytes 0 and 8
  uint16x8_t paired16 = vsraq_n_u16(vreinterpretq_u16_u8(high_bits),
                                    vreinterpretq_u16_u8(high_bits), 7);
  uint32x4_t paired32 = vsraq_n_u32(vreinterpretq_u32_u16(paired16),
                                    vreinterpretq_u32_u16(paired16), 14);
  uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(paired32),
                                    vreinterpretq_u64_u32(paired32), 28);
  // byte 0 holds the mask of lanes 0..7, byte 8 the mask of lanes 8..15
  return vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) |
         ((int) vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) << 8);
}
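// Worked example (illustration only): if only lanes 0 and 3 have their MSB
// set, high_bits = {1, 0, 0, 1, 0, ...}; after the three accumulate steps
// byte 0 contains 0b00001001 and byte 8 contains 0, so msb2int() == 0x09.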
static SIMD_INLINE uint64_t msb2int(const Vec<SignedByte, 16> &a)
{
  return msb2int(reinterpret(a, OutputType<Byte>()));
}

static SIMD_INLINE uint64_t msb2int(const Vec<Word, 16> &a)
{
  // same technique as for Byte, with one pairing step fewer
  uint16x8_t high_bits = vshrq_n_u16(a, 15);
  uint32x4_t paired32 = vsraq_n_u32(vreinterpretq_u32_u16(high_bits),
                                    vreinterpretq_u32_u16(high_bits), 15);
  uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(paired32),
                                    vreinterpretq_u64_u32(paired32), 30);
  return (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) & 0xf) |
         (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) << 4);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Short, 16> &a)
{
  return msb2int(reinterpret(a, OutputType<Word>()));
}

static SIMD_INLINE uint64_t msb2int(const Vec<Int, 16> &a)
{
  uint32x4_t high_bits = vshrq_n_u32(vreinterpretq_u32_s32(a), 31);
  uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(high_bits),
                                    vreinterpretq_u64_u32(high_bits), 31);
  return (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) & 0x3) |
         ((vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) & 0x3) << 2);
}

static SIMD_INLINE uint64_t msb2int(const Vec<Float, 16> &a)
{
  return msb2int(reinterpret(a, OutputType<Int>()));
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE uint64_t msb2int(const Vec<Long, 16> &a)
{
  uint64x2_t high_bits = vshrq_n_u64(vreinterpretq_u64_s64(a), 63);
  return vgetq_lane_u8(vreinterpretq_u8_u64(high_bits), 0) |
         (vgetq_lane_u8(vreinterpretq_u8_u64(high_bits), 8) << 1);
}
static SIMD_INLINE uint64_t msb2int(const Vec<Double, 16> &a)
{
  return msb2int(reinterpret(a, OutputType<Long>()));
}
#endif
// int2msb: inverse of msb2int; set the MSB of lane i iff bit i of the
// input is set (all other bits of each lane stay zero)

static SIMD_INLINE Vec<Byte, 16> int2msb(const uint64_t a, OutputType<Byte>,
                                         Integer<16>)
{
  // broadcast the low/high mask byte into the low/high vector half
  uint8x8_t aVecLo = vdup_n_u8(a & 0xff);
  uint8x8_t aVecHi = vdup_n_u8((a >> 8) & 0xff);
  uint8x16_t aVec = vcombine_u8(aVecLo, aVecHi);
  // shift bit i of each copy into the MSB position of lane i
  int8x16_t shiftAmounts = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
  uint8x16_t shifted = vshlq_u8(aVec, shiftAmounts);
  return vandq_u8(shifted, vdupq_n_u8(0x80));
}
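// Illustration: int2msb(0x09, OutputType<Byte>(), Integer<16>()) sets 0x80
// in lanes 0 and 3 and leaves all other lanes zero, so msb2int() applied
// to the result recovers 0x09.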
static SIMD_INLINE Vec<SignedByte, 16> int2msb(const uint64_t a,
                                               OutputType<SignedByte>,
                                               Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Byte>(), Integer<16>()),
                     OutputType<SignedByte>());
}

static SIMD_INLINE Vec<Word, 16> int2msb(const uint64_t a, OutputType<Word>,
                                         Integer<16>)
{
  uint16x8_t aVec = vdupq_n_u16(a & 0xff);
  int16x8_t shiftAmounts = {15, 14, 13, 12, 11, 10, 9, 8};
  uint16x8_t shifted = vshlq_u16(aVec, shiftAmounts);
  return vandq_u16(shifted, vdupq_n_u16(0x8000));
}

static SIMD_INLINE Vec<Short, 16> int2msb(const uint64_t a, OutputType<Short>,
                                          Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Word>(), Integer<16>()),
                     OutputType<Short>());
}

static SIMD_INLINE Vec<Int, 16> int2msb(const uint64_t a, OutputType<Int>,
                                        Integer<16>)
{
  int32x4_t aVec = vdupq_n_s32(a & 0xf);
  int32x4_t shiftAmounts = {31, 30, 29, 28};
  int32x4_t shifted = vshlq_s32(aVec, shiftAmounts);
  return vandq_s32(shifted, vdupq_n_s32(0x80000000));
}

static SIMD_INLINE Vec<Float, 16> int2msb(const uint64_t a, OutputType<Float>,
                                          Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Int>(), Integer<16>()),
                     OutputType<Float>());
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> int2msb(const uint64_t a, OutputType<Long>,
                                         Integer<16>)
{
  int64x2_t aVec = vdupq_n_s64(a & 0x3);
  int64x2_t shiftAmounts = {63, 62};
  int64x2_t shifted = vshlq_s64(aVec, shiftAmounts);
  int64x2_t result = vandq_s64(shifted, vdupq_n_s64(0x8000000000000000));
  return result;
}
static SIMD_INLINE Vec<Double, 16> int2msb(const uint64_t a, OutputType<Double>,
                                           Integer<16>)
{
  return reinterpret(int2msb(a, OutputType<Long>(), Integer<16>()),
                     OutputType<Double>());
}
#endif
// int2bits: set ALL bits of lane i iff bit i of the input is set
// (full-lane masks, in contrast to int2msb)

static SIMD_INLINE Vec<Byte, 16> int2bits(const uint64_t a, OutputType<Byte>,
                                          Integer<16>)
{
  uint8x8_t aVecLo = vdup_n_u8(a & 0xff);
  uint8x8_t aVecHi = vdup_n_u8((a >> 8) & 0xff);
  uint8x16_t aVec = vcombine_u8(aVecLo, aVecHi);
  // vtstq sets all lane bits if (aVec & sel) != 0; sel picks bit i in lane i
  uint8x16_t sel = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  return vtstq_u8(aVec, sel);
}

static SIMD_INLINE Vec<SignedByte, 16> int2bits(const uint64_t a,
                                                OutputType<SignedByte>,
                                                Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Byte>(), Integer<16>()),
                     OutputType<SignedByte>());
}

static SIMD_INLINE Vec<Word, 16> int2bits(const uint64_t a, OutputType<Word>,
                                          Integer<16>)
{
  uint16x8_t aVec = vdupq_n_u16(a & 0xff);
  uint16x8_t sel = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  return vtstq_u16(aVec, sel);
}

static SIMD_INLINE Vec<Short, 16> int2bits(const uint64_t a, OutputType<Short>,
                                           Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Word>(), Integer<16>()),
                     OutputType<Short>());
}

static SIMD_INLINE Vec<Int, 16> int2bits(const uint64_t a, OutputType<Int>,
                                         Integer<16>)
{
  int32x4_t aVec = vdupq_n_s32(a & 0xf);
  int32x4_t sel = {0x01, 0x02, 0x04, 0x08};
  return vreinterpretq_s32_u32(vtstq_s32(aVec, sel));
}

static SIMD_INLINE Vec<Float, 16> int2bits(const uint64_t a, OutputType<Float>,
                                           Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Int>(), Integer<16>()),
                     OutputType<Float>());
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> int2bits(const uint64_t a, OutputType<Long>,
                                          Integer<16>)
{
  int64x2_t aVec = vdupq_n_s64(a & 0xf);
  int64x2_t sel = {0x01, 0x02};
  return vreinterpretq_s64_u64(vtstq_s64(aVec, sel));
}
static SIMD_INLINE Vec<Double, 16> int2bits(const uint64_t a,
                                            OutputType<Double>, Integer<16>)
{
  return reinterpret(int2bits(a, OutputType<Long>(), Integer<16>()),
                     OutputType<Double>());
}
#endif
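// Illustration: int2bits(0x5, OutputType<Int>(), Integer<16>()) yields the
// full-lane masks {-1, 0, -1, 0}, whereas int2msb(0x5, ...) would only set
// the sign bit in lanes 0 and 2.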
// iota: vector with ascending lane indices 0, 1, 2, ...

static SIMD_INLINE Vec<Byte, 16> iota(OutputType<Byte>, Integer<16>)
{
  uint8x16_t res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return res;
}

static SIMD_INLINE Vec<SignedByte, 16> iota(OutputType<SignedByte>, Integer<16>)
{
  int8x16_t res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return res;
}

static SIMD_INLINE Vec<Word, 16> iota(OutputType<Word>, Integer<16>)
{
  uint16x8_t res = {0, 1, 2, 3, 4, 5, 6, 7};
  return res;
}

static SIMD_INLINE Vec<Short, 16> iota(OutputType<Short>, Integer<16>)
{
  int16x8_t res = {0, 1, 2, 3, 4, 5, 6, 7};
  return res;
}

static SIMD_INLINE Vec<Int, 16> iota(OutputType<Int>, Integer<16>)
{
  int32x4_t res = {0, 1, 2, 3};
  return res;
}

static SIMD_INLINE Vec<Float, 16> iota(OutputType<Float>, Integer<16>)
{
  float32x4_t res = {0.0f, 1.0f, 2.0f, 3.0f};
  return res;
}

#ifdef SIMD_64BIT_TYPES
static SIMD_INLINE Vec<Long, 16> iota(OutputType<Long>, Integer<16>)
{
  int64x2_t res = {0, 1};
  return res;
}
static SIMD_INLINE Vec<Double, 16> iota(OutputType<Double>, Integer<16>)
{
  float64x2_t res = {0.0, 1.0};
  return res;
}
#endif
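// Usage sketch (illustration only, nVec assumed to hold n in every lane):
//   Vec<Int, 16> firstN = cmplt(iota(OutputType<Int>(), Integer<16>()), nVec);
// yields all-ones masks in lanes 0..n-1 and zeros elsewhere.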