T-SIMD v31.1.0
A C++ template SIMD library
base_impl_neon16.H
1// ===========================================================================
2//
3// encapsulation for ARM NEON vector extension
4// inspired by Agner Fog's C++ Vector Class Library
5// http://www.agner.org/optimize/#vectorclass
6// (VCL License: GNU General Public License Version 3,
7// http://www.gnu.org/licenses/gpl-3.0.en.html)
8//
9// This source code file is part of the following software:
10//
11// - the low-level C++ template SIMD library
12// - the SIMD implementation of the MinWarping and the 2D-Warping methods
13// for local visual homing.
14//
15// The software is provided based on the accompanying license agreement in the
16// file LICENSE.md.
17// The software is provided "as is" without any warranty by the licensor and
18// without any liability of the licensor, and the software may not be
19// distributed by the licensee; see the license agreement for details.
20//
21// (C) Ralf Möller
22// Computer Engineering
23// Faculty of Technology
24// Bielefeld University
25// www.ti.uni-bielefeld.de
26//
27// ===========================================================================
28
29// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
30// namespace
31
32// NOTES:
33//
34// echo | gcc -E -dM -mcpu=cortex-a9 -mfpu=neon - | more
35// echo | arm-linux-gnueabihf-gcc -E -dM -mcpu=cortex-a15 -mfpu=neon - | more
36//
37// -mfpu=neon
38// -mfpu=neon-fp16
39//
40// GCC 4.9:
41// GCC now supports Cortex-A12 and the Cortex-R7 through the
42// -mcpu=cortex-a12 and -mcpu=cortex-r7 options.
43//
44// GCC now has tuning for the Cortex-A57 and Cortex-A53 through the
45// -mcpu=cortex-a57 and -mcpu=cortex-a53 options.
46//
47// Initial big.LITTLE tuning support for the combination of Cortex-A57
48// and Cortex-A53 was added through the -mcpu=cortex-a57.cortex-a53
49// option. Similar support was added for the combination of Cortex-A15
50// and Cortex-A7 through the -mcpu=cortex-a15.cortex-a7 option.
51
52#pragma once
53#ifndef SIMD_VEC_BASE_IMPL_NEON_16_H_
54#define SIMD_VEC_BASE_IMPL_NEON_16_H_
55
56#include "../alloc.H"
57#include "../defs.H"
58#include "../types.H"
59#include "../vec.H"
60#include "intrins_neon.H"
61
62#include <algorithm>
63#include <cstddef>
64#include <cstdint>
65#include <type_traits>
66
67#if defined(SIMDVEC_NEON_ENABLE) && defined(_SIMD_VEC_16_AVAIL_) && \
68 !defined(SIMDVEC_SANDBOX)
69
70namespace simd {
71namespace internal {
72namespace base {
73// =========================================================================
74// type templates
75// =========================================================================
76
77// -------------------------------------------------------------------------
78// default vector type collection
79// -------------------------------------------------------------------------
80
81template <typename T>
82struct _NEONRegType;
83// clang-format off
84template <> struct _NEONRegType<Byte> { using Type = uint8x16_t; };
85template <> struct _NEONRegType<SignedByte> { using Type = int8x16_t; };
86template <> struct _NEONRegType<Word> { using Type = uint16x8_t; };
87template <> struct _NEONRegType<Short> { using Type = int16x8_t; };
88template <> struct _NEONRegType<Int> { using Type = int32x4_t; };
89template <> struct _NEONRegType<Float> { using Type = float32x4_t; };
90#ifdef SIMD_64BIT_TYPES
91template <> struct _NEONRegType<Long> { using Type = int64x2_t; };
92template <> struct _NEONRegType<Double> { using Type = float64x2_t; };
93#endif
94// clang-format on
95
96template <typename T>
97using NEONRegType = typename _NEONRegType<T>::Type;
98
99// -------------------------------------------------------------------------
100// 64bit array type collection
101// -------------------------------------------------------------------------
102
103template <size_t N, typename T>
104struct SIMDVecNeonArray64;
105
106#define SIMDVEC_NEON_ARRAY64(NUM, T, NEON_T) \
107 template <> \
108 struct SIMDVecNeonArray64<NUM, T> \
109 { \
110 using Type = NEON_T##x##NUM##_t; \
111 using ValType = NEON_T##_t; \
112 };
113
114#define SIMDVEC_NEON_ARRAY64_ALLNUM(T, NEON_T) \
115 SIMDVEC_NEON_ARRAY64(1, T, NEON_T) \
116 SIMDVEC_NEON_ARRAY64(2, T, NEON_T) \
117 SIMDVEC_NEON_ARRAY64(3, T, NEON_T) \
118 SIMDVEC_NEON_ARRAY64(4, T, NEON_T)
119
120SIMDVEC_NEON_ARRAY64_ALLNUM(Byte, uint8x8)
121SIMDVEC_NEON_ARRAY64_ALLNUM(SignedByte, int8x8)
122SIMDVEC_NEON_ARRAY64_ALLNUM(Word, uint16x4)
123SIMDVEC_NEON_ARRAY64_ALLNUM(Short, int16x4)
124SIMDVEC_NEON_ARRAY64_ALLNUM(Int, int32x2)
125SIMDVEC_NEON_ARRAY64_ALLNUM(Float, float32x2)
126#ifdef SIMD_64BIT_TYPES
127SIMDVEC_NEON_ARRAY64_ALLNUM(Double, float64x1)
128#endif
129
130#undef SIMDVEC_NEON_ARRAY64
131#undef SIMDVEC_NEON_ARRAY64_ALLNUM
132
133} // namespace base
134} // namespace internal
135
136// =========================================================================
137// Vec instantiation for NEON
138// =========================================================================
139
140template <typename T>
141class Vec<T, 16>
142{
143 using RegType = internal::base::NEONRegType<T>;
144 RegType reg = {};
145
146public:
147 using Type = T;
148 static constexpr size_t elements = 16 / sizeof(T);
149 static constexpr size_t elems = elements;
150 static constexpr size_t bytes = 16;
151
152 Vec() = default;
153 Vec(const RegType &x) { reg = x; }
154 Vec &operator=(const RegType &x)
155 {
156 reg = x;
157 return *this;
158 }
159 operator RegType() const { return reg; }
160 // 29. Nov 22 (Jonas Keller):
161 // defined operators new and delete to ensure proper alignment, since
162 // the default new and delete are not guaranteed to do so before C++17
163 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
164 void operator delete(void *p) { aligned_free(p); }
165 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
166 void operator delete[](void *p) { aligned_free(p); }
167 // 05. Sep 23 (Jonas Keller): added allocator
168 using allocator = aligned_allocator<Vec<T, bytes>, bytes>;
169};
170
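// Usage sketch (illustrative, not part of the library interface): the
// aligned operator new/delete and the allocator typedef make heap
// allocation and standard containers safe for Vec even before C++17:
//
//   Vec<Float, 16> *v = new Vec<Float, 16>[8];  // 16-byte aligned
//   delete[] v;
//   std::vector<Vec<Float, 16>, Vec<Float, 16>::allocator> buf(100);
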
171namespace internal {
172namespace base {
173
174// =========================================================================
175// macros for common functions
176// =========================================================================
177
178// -------------------------------------------------------------------------
179// binary functions (same input and output type)
180// -------------------------------------------------------------------------
181
182// wrapper for arbitrary binary function
183#define SIMDVEC_NEON_BINARY(FCT, TYPE, NEON_FCT, NEON_SUF) \
184 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
185 const Vec<TYPE, 16> &b) \
186 { \
187 return NEON_FCT##_##NEON_SUF(a, b); \
188 }
189
190#ifdef SIMD_64BIT_TYPES
191#define SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT) \
192 SIMDVEC_NEON_BINARY(FCT, Byte, NEON_FCT, u8) \
193 SIMDVEC_NEON_BINARY(FCT, SignedByte, NEON_FCT, s8) \
194 SIMDVEC_NEON_BINARY(FCT, Word, NEON_FCT, u16) \
195 SIMDVEC_NEON_BINARY(FCT, Short, NEON_FCT, s16) \
196 SIMDVEC_NEON_BINARY(FCT, Int, NEON_FCT, s32) \
197 SIMDVEC_NEON_BINARY(FCT, Long, NEON_FCT, s64)
198#else
199#define SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT) \
200 SIMDVEC_NEON_BINARY(FCT, Byte, NEON_FCT, u8) \
201 SIMDVEC_NEON_BINARY(FCT, SignedByte, NEON_FCT, s8) \
202 SIMDVEC_NEON_BINARY(FCT, Word, NEON_FCT, u16) \
203 SIMDVEC_NEON_BINARY(FCT, Short, NEON_FCT, s16) \
204 SIMDVEC_NEON_BINARY(FCT, Int, NEON_FCT, s32)
205#endif
206
207#ifdef SIMD_64BIT_TYPES
208#define SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT) \
209 SIMDVEC_NEON_BINARY(FCT, Float, NEON_FCT, f32) \
210 SIMDVEC_NEON_BINARY(FCT, Double, NEON_FCT, f64)
211#else
212#define SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT) \
213 SIMDVEC_NEON_BINARY(FCT, Float, NEON_FCT, f32)
214#endif
215
216#define SIMDVEC_NEON_BINARY_ALL(FCT, NEON_FCT) \
217 SIMDVEC_NEON_BINARY_ALLINT(FCT, NEON_FCT) \
218 SIMDVEC_NEON_BINARY_ALLFLOAT(FCT, NEON_FCT)
219
220// -------------------------------------------------------------------------
221// unary functions
222// -------------------------------------------------------------------------
223
224#define SIMDVEC_NEON_UNARY(FCT, TYPE, NEON_FCT, NEON_SUF) \
225 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a) \
226 { \
227 return NEON_FCT##_##NEON_SUF(a); \
228 }
229
230// #########################################################################
231// #########################################################################
232// #########################################################################
233
234// =========================================================================
235// Vec function instantiations or overloading for NEON
236// =========================================================================
237
238// -------------------------------------------------------------------------
239// reinterpretation casts
240// -------------------------------------------------------------------------
241
242// wrapper for vreinterpretq
243#define SIMDVEC_NEON_REINTERP(TDST, NEON_TDST, TSRC, NEON_TSRC) \
244 static SIMD_INLINE Vec<TDST, 16> reinterpret(const Vec<TSRC, 16> &vec, \
245 OutputType<TDST>) \
246 { \
247 return vreinterpretq_##NEON_TDST##_##NEON_TSRC(vec); \
248 }
249
250// wrapper for all dst types and same source type
251#ifdef SIMD_64BIT_TYPES
252#define SIMDVEC_NEON_REINTERP_ALLDST(TSRC, NEON_TSRC) \
253 SIMDVEC_NEON_REINTERP(Byte, u8, TSRC, NEON_TSRC) \
254 SIMDVEC_NEON_REINTERP(SignedByte, s8, TSRC, NEON_TSRC) \
255 SIMDVEC_NEON_REINTERP(Word, u16, TSRC, NEON_TSRC) \
256 SIMDVEC_NEON_REINTERP(Short, s16, TSRC, NEON_TSRC) \
257 SIMDVEC_NEON_REINTERP(Int, s32, TSRC, NEON_TSRC) \
258 SIMDVEC_NEON_REINTERP(Long, s64, TSRC, NEON_TSRC) \
259 SIMDVEC_NEON_REINTERP(Float, f32, TSRC, NEON_TSRC) \
260 SIMDVEC_NEON_REINTERP(Double, f64, TSRC, NEON_TSRC)
261#else
262#define SIMDVEC_NEON_REINTERP_ALLDST(TSRC, NEON_TSRC) \
263 SIMDVEC_NEON_REINTERP(Byte, u8, TSRC, NEON_TSRC) \
264 SIMDVEC_NEON_REINTERP(SignedByte, s8, TSRC, NEON_TSRC) \
265 SIMDVEC_NEON_REINTERP(Word, u16, TSRC, NEON_TSRC) \
266 SIMDVEC_NEON_REINTERP(Short, s16, TSRC, NEON_TSRC) \
267 SIMDVEC_NEON_REINTERP(Int, s32, TSRC, NEON_TSRC) \
268 SIMDVEC_NEON_REINTERP(Float, f32, TSRC, NEON_TSRC)
269#endif
270
271// wrapper for all dst and src types
272SIMDVEC_NEON_REINTERP_ALLDST(Byte, u8)
273SIMDVEC_NEON_REINTERP_ALLDST(SignedByte, s8)
274SIMDVEC_NEON_REINTERP_ALLDST(Word, u16)
275SIMDVEC_NEON_REINTERP_ALLDST(Short, s16)
276SIMDVEC_NEON_REINTERP_ALLDST(Int, s32)
277SIMDVEC_NEON_REINTERP_ALLDST(Float, f32)
278#ifdef SIMD_64BIT_TYPES
279SIMDVEC_NEON_REINTERP_ALLDST(Long, s64)
280SIMDVEC_NEON_REINTERP_ALLDST(Double, f64)
281#endif
282
283#undef SIMDVEC_NEON_REINTERP_ALLDST
284#undef SIMDVEC_NEON_REINTERP
285
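// Example (sketch): reinterpret changes only the type tag, never the
// bits; e.g. 1.0f has the bit pattern 0x3f800000, so
//
//   Vec<Float, 16> f = set1(Float(1.0f), Integer<16>());
//   Vec<Int, 16> i = reinterpret(f, OutputType<Int>()); // 0x3f800000 per lane
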
286// -------------------------------------------------------------------------
287// convert (without changes in the number of elements)
288// -------------------------------------------------------------------------
289
290// conversion seems to be saturated in all cases (specified by the
291// rounding mode):
292// http://stackoverflow.com/questions/24546927/
293// behavior-of-arm-neon-float-integer-conversion-with-overflow
294
295// saturated
296// TODO: rounding in cvts (float->int)? +0.5?
297// TODO: (NOT the same behavior as in SIMDVecBaseImplIntel16.H
298// TODO: float->int always uses round towards zero = trunc?)
299// TODO: cvts: should we saturate in the same way as for Intel?
300// TODO: (Intel saturates to max. float which is convertible to int,
301// TODO: NEON saturates to 0x7fffffff)
302static SIMD_INLINE Vec<Int, 16> cvts(const Vec<Float, 16> &a, OutputType<Int>)
303{
304 return vcvtq_s32_f32(a);
305}
306
307// saturation is not necessary in this case
308static SIMD_INLINE Vec<Float, 16> cvts(const Vec<Int, 16> &a, OutputType<Float>)
309{
310 return vcvtq_f32_s32(a);
311}
312
313#ifdef SIMD_64BIT_TYPES
314static SIMD_INLINE Vec<Long, 16> cvts(const Vec<Double, 16> &a,
315 OutputType<Long>)
316{
317 return vcvtq_s64_f64(a);
318}
319
320static SIMD_INLINE Vec<Double, 16> cvts(const Vec<Long, 16> &a,
321 OutputType<Double>)
322{
323 return vcvtq_f64_s64(a);
324}
325#endif
326
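// Illustrative examples (assuming NEON's round-towards-zero,
// saturating float->int conversion):
//   cvts( 1.9f) ->  1,   cvts(-1.9f) -> -1
//   cvts( 3e9f) ->  0x7fffffff (saturated)
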
327// -------------------------------------------------------------------------
328// setzero
329// -------------------------------------------------------------------------
330
331#define SIMDVEC_NEON_SETZERO(TYPE, NEON_SUF) \
332 static SIMD_INLINE Vec<TYPE, 16> setzero(OutputType<TYPE>, Integer<16>) \
333 { \
334 return vmovq_n##_##NEON_SUF(TYPE(0)); \
335 }
336
337SIMDVEC_NEON_SETZERO(Byte, u8)
338SIMDVEC_NEON_SETZERO(SignedByte, s8)
339SIMDVEC_NEON_SETZERO(Word, u16)
340SIMDVEC_NEON_SETZERO(Short, s16)
341SIMDVEC_NEON_SETZERO(Int, s32)
342SIMDVEC_NEON_SETZERO(Float, f32)
343#ifdef SIMD_64BIT_TYPES
344SIMDVEC_NEON_SETZERO(Long, s64)
345SIMDVEC_NEON_SETZERO(Double, f64)
346#endif
347
348#undef SIMDVEC_NEON_SETZERO
349
350// -------------------------------------------------------------------------
351// set1
352// -------------------------------------------------------------------------
353
354#define SIMDVEC_NEON_SET1(TYPE, NEON_SUF) \
355 static SIMD_INLINE Vec<TYPE, 16> set1(TYPE a, Integer<16>) \
356 { \
357 return vdupq_n##_##NEON_SUF(a); \
358 }
359
360SIMDVEC_NEON_SET1(Byte, u8)
361SIMDVEC_NEON_SET1(SignedByte, s8)
362SIMDVEC_NEON_SET1(Word, u16)
363SIMDVEC_NEON_SET1(Short, s16)
364SIMDVEC_NEON_SET1(Int, s32)
365SIMDVEC_NEON_SET1(Float, f32)
366#ifdef SIMD_64BIT_TYPES
367SIMDVEC_NEON_SET1(Long, s64)
368SIMDVEC_NEON_SET1(Double, f64)
369#endif
370
371#undef SIMDVEC_NEON_SET1
372
373// -------------------------------------------------------------------------
374// load
375// -------------------------------------------------------------------------
376
377#define SIMDVEC_NEON_LOAD(TYPE, NEON_SUF) \
378 static SIMD_INLINE Vec<TYPE, 16> load(const TYPE *const p, Integer<16>) \
379 { \
380 return vld1q##_##NEON_SUF(p); \
381 } \
382 static SIMD_INLINE Vec<TYPE, 16> loadu(const TYPE *const p, Integer<16>) \
383 { \
384 return vld1q##_##NEON_SUF(p); \
385 }
386
387SIMDVEC_NEON_LOAD(Byte, u8)
388SIMDVEC_NEON_LOAD(SignedByte, s8)
389SIMDVEC_NEON_LOAD(Word, u16)
390SIMDVEC_NEON_LOAD(Short, s16)
391SIMDVEC_NEON_LOAD(Int, s32)
392SIMDVEC_NEON_LOAD(Float, f32)
393#ifdef SIMD_64BIT_TYPES
394SIMDVEC_NEON_LOAD(Long, s64)
395SIMDVEC_NEON_LOAD(Double, f64)
396#endif
397
398#undef SIMDVEC_NEON_LOAD
399
400// -------------------------------------------------------------------------
401// store
402// -------------------------------------------------------------------------
403
404#define SIMDVEC_NEON_STORE(TYPE, NEON_SUF) \
405 static SIMD_INLINE void store(TYPE *const p, const Vec<TYPE, 16> &a) \
406 { \
407 return vst1q##_##NEON_SUF(p, a); \
408 } \
409 static SIMD_INLINE void storeu(TYPE *const p, const Vec<TYPE, 16> &a) \
410 { \
411 return vst1q##_##NEON_SUF(p, a); \
412 } \
413 static SIMD_INLINE void stream_store(TYPE *const p, const Vec<TYPE, 16> &a) \
414 { \
415 return vst1q##_##NEON_SUF(p, a); \
416 }
417
418SIMDVEC_NEON_STORE(Byte, u8)
419SIMDVEC_NEON_STORE(SignedByte, s8)
420SIMDVEC_NEON_STORE(Word, u16)
421SIMDVEC_NEON_STORE(Short, s16)
422SIMDVEC_NEON_STORE(Int, s32)
423SIMDVEC_NEON_STORE(Float, f32)
424#ifdef SIMD_64BIT_TYPES
425SIMDVEC_NEON_STORE(Long, s64)
426SIMDVEC_NEON_STORE(Double, f64)
427#endif
428
429#undef SIMDVEC_NEON_STORE
430
431// -------------------------------------------------------------------------
432// fences
433// -------------------------------------------------------------------------
434
435// http://infocenter.arm.com/help/
436// index.jsp?topic=/com.arm.doc.faqs/ka14552.html
437// TODO: is this portable to clang?
438
439// NOTE: implemented as full barrier
440static SIMD_INLINE void lfence()
441{
442 SIMD_FULL_MEMBARRIER;
443}
444
445// NOTE: implemented as full barrier
446static SIMD_INLINE void sfence()
447{
448 SIMD_FULL_MEMBARRIER;
449}
450
451// NOTE: implemented as full barrier
452static SIMD_INLINE void mfence()
453{
454 SIMD_FULL_MEMBARRIER;
455}
456
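// NOTE: a full barrier is stronger than lfence/sfence strictly require,
// but safe; on ARM, SIMD_FULL_MEMBARRIER presumably expands to a full
// data memory barrier (dmb).
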
457// -------------------------------------------------------------------------
458// extract: with template parameter for immediate argument
459// -------------------------------------------------------------------------
460
461#define SIMDVEC_NEON_EXTRACT(TYPE, NEON_SUF) \
462 template <size_t COUNT> \
463 static SIMD_INLINE TYPE extract(const Vec<TYPE, 16> &a) \
464 { \
465 SIMD_IF_CONSTEXPR (COUNT < Vec<TYPE, 16>::elements) { \
466 return vgetq_lane##_##NEON_SUF(a, COUNT); \
467 } else { \
468 return TYPE(0); \
469 } \
470 }
471
472SIMDVEC_NEON_EXTRACT(Byte, u8)
473SIMDVEC_NEON_EXTRACT(SignedByte, s8)
474SIMDVEC_NEON_EXTRACT(Word, u16)
475SIMDVEC_NEON_EXTRACT(Short, s16)
476SIMDVEC_NEON_EXTRACT(Int, s32)
477SIMDVEC_NEON_EXTRACT(Float, f32)
478#ifdef SIMD_64BIT_TYPES
479SIMDVEC_NEON_EXTRACT(Long, s64)
480SIMDVEC_NEON_EXTRACT(Double, f64)
481#endif
482
483#undef SIMDVEC_NEON_EXTRACT
484
485// -------------------------------------------------------------------------
486// add
487// -------------------------------------------------------------------------
488
489SIMDVEC_NEON_BINARY_ALL(add, vaddq)
490
491// -------------------------------------------------------------------------
492// adds
493// -------------------------------------------------------------------------
494
495SIMDVEC_NEON_BINARY_ALLINT(adds, vqaddq)
496// float NOT saturated
497SIMDVEC_NEON_BINARY(adds, Float, vaddq, f32)
498#ifdef SIMD_64BIT_TYPES
499SIMDVEC_NEON_BINARY(adds, Double, vaddq, f64)
500#endif
501
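// Example: for Byte lanes holding 200 and 100, adds (vqaddq_u8)
// saturates to 255, while add (vaddq_u8) wraps around to 44.
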
502// -------------------------------------------------------------------------
503// sub
504// -------------------------------------------------------------------------
505
506SIMDVEC_NEON_BINARY_ALL(sub, vsubq)
507
508// -------------------------------------------------------------------------
509// subs
510// -------------------------------------------------------------------------
511
512SIMDVEC_NEON_BINARY_ALLINT(subs, vqsubq)
513// float NOT saturated
514SIMDVEC_NEON_BINARY(subs, Float, vsubq, f32)
515#ifdef SIMD_64BIT_TYPES
516SIMDVEC_NEON_BINARY(subs, Double, vsubq, f64)
517#endif
518
519// -------------------------------------------------------------------------
520// neg (negate = two's complement or unary minus), only signed types
521// -------------------------------------------------------------------------
522
523SIMDVEC_NEON_UNARY(neg, SignedByte, vnegq, s8)
524SIMDVEC_NEON_UNARY(neg, Short, vnegq, s16)
525SIMDVEC_NEON_UNARY(neg, Int, vnegq, s32)
526SIMDVEC_NEON_UNARY(neg, Float, vnegq, f32)
527#ifdef SIMD_64BIT_TYPES
528SIMDVEC_NEON_UNARY(neg, Long, vnegq, s64)
529SIMDVEC_NEON_UNARY(neg, Double, vnegq, f64)
530#endif
531
532// -------------------------------------------------------------------------
533// min
534// -------------------------------------------------------------------------
535
536SIMDVEC_NEON_BINARY(min, Byte, vminq, u8)
537SIMDVEC_NEON_BINARY(min, SignedByte, vminq, s8)
538SIMDVEC_NEON_BINARY(min, Word, vminq, u16)
539SIMDVEC_NEON_BINARY(min, Short, vminq, s16)
540SIMDVEC_NEON_BINARY(min, Int, vminq, s32)
541SIMDVEC_NEON_BINARY(min, Float, vminq, f32)
542#ifdef SIMD_64BIT_TYPES
543static SIMD_INLINE Vec<Long, 16> min(const Vec<Long, 16> &a,
544 const Vec<Long, 16> &b)
545{
546 // vminq_s64 does not exist
547 return vbslq_s64(vcltq_s64(a, b), a, b);
548}
549SIMDVEC_NEON_BINARY(min, Double, vminq, f64)
550#endif
551
552// -------------------------------------------------------------------------
553// max
554// -------------------------------------------------------------------------
555
556SIMDVEC_NEON_BINARY(max, Byte, vmaxq, u8)
557SIMDVEC_NEON_BINARY(max, SignedByte, vmaxq, s8)
558SIMDVEC_NEON_BINARY(max, Word, vmaxq, u16)
559SIMDVEC_NEON_BINARY(max, Short, vmaxq, s16)
560SIMDVEC_NEON_BINARY(max, Int, vmaxq, s32)
561SIMDVEC_NEON_BINARY(max, Float, vmaxq, f32)
562#ifdef SIMD_64BIT_TYPES
563static SIMD_INLINE Vec<Long, 16> max(const Vec<Long, 16> &a,
564 const Vec<Long, 16> &b)
565{
566 // vmaxq_s64 does not exist
567 return vbslq_s64(vcgtq_s64(a, b), a, b);
568}
569SIMDVEC_NEON_BINARY(max, Double, vmaxq, f64)
570#endif
571
572// -------------------------------------------------------------------------
573// mul, div
574// -------------------------------------------------------------------------
575
576SIMDVEC_NEON_BINARY(mul, Float, vmulq, f32)
577#ifdef SIMD_64BIT_TYPES
578SIMDVEC_NEON_BINARY(mul, Double, vmulq, f64)
579#endif
580
581const auto DIV_NEWTON_STEPS = 2;
582
583// adapted from Jens Froemmer's Bachelor's thesis (2014)
584static SIMD_INLINE Vec<Float, 16> div(const Vec<Float, 16> &num,
585 const Vec<Float, 16> &denom)
586{
587 // get estimate of reciprocal of denom
588 float32x4_t reciprocal = vrecpeq_f32(denom);
589 // refine estimate using Newton-Raphson steps
590 for (size_t i = 0; i < DIV_NEWTON_STEPS; i++)
591 reciprocal = vmulq_f32(vrecpsq_f32(denom, reciprocal), reciprocal);
592 // num * (1.0 / denom)
593 return vmulq_f32(num, reciprocal);
594}
595
596#ifdef SIMD_64BIT_TYPES
597static SIMD_INLINE Vec<Double, 16> div(const Vec<Double, 16> &num,
598 const Vec<Double, 16> &denom)
599{
600 // get estimate of reciprocal of denom
601 float64x2_t reciprocal = vrecpeq_f64(denom);
602 // refine estimate using Newton-Raphson steps
603 for (size_t i = 0; i < DIV_NEWTON_STEPS; i++)
604 reciprocal = vmulq_f64(vrecpsq_f64(denom, reciprocal), reciprocal);
605 // num * (1.0 / denom)
606 return vmulq_f64(num, reciprocal);
607}
608#endif
609
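// Math behind the refinement (sketch): vrecpsq_f32(d, x) computes
// 2 - d*x, so each loop iteration performs the Newton-Raphson step
//   x' = x * (2 - d*x)
// for f(x) = 1/x - d; every step roughly doubles the number of correct
// bits of the initial vrecpeq estimate.
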
610// -------------------------------------------------------------------------
611// ceil, floor, round, truncate
612// -------------------------------------------------------------------------
613
614// 25. Mar 23 (Jonas Keller): added versions for integer types
615
616// versions for integer types do nothing:
617
618template <typename T>
619static SIMD_INLINE Vec<T, 16> ceil(const Vec<T, 16> &a)
620{
621 static_assert(std::is_integral<T>::value, "");
622 return a;
623}
624
625template <typename T>
626static SIMD_INLINE Vec<T, 16> floor(const Vec<T, 16> &a)
627{
628 static_assert(std::is_integral<T>::value, "");
629 return a;
630}
631
632template <typename T>
633static SIMD_INLINE Vec<T, 16> round(const Vec<T, 16> &a)
634{
635 static_assert(std::is_integral<T>::value, "");
636 return a;
637}
638
639template <typename T>
640static SIMD_INLINE Vec<T, 16> truncate(const Vec<T, 16> &a)
641{
642 static_assert(std::is_integral<T>::value, "");
643 return a;
644}
645
646// http://www.rowleydownload.co.uk/arm/documentation/gnu/gcc/
647// ARM-NEON-Intrinsics.html
648// vrnd, only some architectures, see arm_neon.h
649
650#if __ARM_ARCH >= 8
651
652// 10. Apr 19 (rm): BINARY->UNARY, qp -> pq etc., still not tested
653SIMDVEC_NEON_UNARY(ceil, Float, vrndpq, f32)
654SIMDVEC_NEON_UNARY(floor, Float, vrndmq, f32)
655SIMDVEC_NEON_UNARY(round, Float, vrndnq, f32)
656SIMDVEC_NEON_UNARY(truncate, Float, vrndq, f32)
657
658#ifdef SIMD_64BIT_TYPES
659SIMDVEC_NEON_UNARY(ceil, Double, vrndpq, f64)
660SIMDVEC_NEON_UNARY(floor, Double, vrndmq, f64)
661SIMDVEC_NEON_UNARY(round, Double, vrndnq, f64)
662SIMDVEC_NEON_UNARY(truncate, Double, vrndq, f64)
663#endif
664
665#else
666
667static SIMD_INLINE Vec<Float, 16> truncate(const Vec<Float, 16> &a)
668{
669 // if e>=23, floating point number represents an integer, 2^23 = 8388608
670 float32x4_t limit = vmovq_n_f32(8388608.f);
671 // bool mask: no rounding required if abs(a) >= limit
672 uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
673 // truncated result (for |a| < limit)
674 float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
675 // select result
676 return vbslq_f32(noRndReq, a, aTrunc);
677}
678
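// Why 8388608: a float has 23 mantissa bits, so every float with
// |x| >= 2^23 is already an integer and must be passed through
// unchanged; this also avoids overflow in the float->int->float
// round trip used by these functions.
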
679// https://en.wikipedia.org/wiki/Floor_and_ceiling_functions
680//
681// floor, ceil:
682//                 { floor(x), x >= 0
683// truncate(x) =   {
684//                 { ceil(x),  x < 0
685//
686// floor(x) = ceil(x) - (x in Z ? 0 : 1)
687// ceil(x) = floor(x) + (x in Z ? 0 : 1)
688
689static SIMD_INLINE Vec<Float, 16> floor(const Vec<Float, 16> &a)
690{
691 // if e>=23, floating point number represents an integer, 2^23 = 8388608
692 float32x4_t limit = vmovq_n_f32(8388608.f);
693 // bool mask: no rounding required if abs(a) >= limit
694 uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
695 // bool mask: true if a is negative
696 uint32x4_t isNeg =
697 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
698 // truncated result (for |a| < limit)
699 float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
700 // check if a is an integer
701 uint32x4_t isNotInt = vmvnq_u32(vceqq_f32(a, aTrunc));
702 // constant 1.0
703 float32x4_t one = vmovq_n_f32(1.0f);
704 // mask which is 1.0f for negative non-integer values, 0.0f otherwise
705 float32x4_t oneMask = vreinterpretq_f32_u32(
706 vandq_u32(vandq_u32(isNeg, isNotInt), vreinterpretq_u32_f32(one)));
707// if negative, trunc computes ceil; to turn it into floor we subtract
708// 1 if a is not an integer
709 aTrunc = vsubq_f32(aTrunc, oneMask);
710 // select result (a or aTrunc)
711 return vbslq_f32(noRndReq, a, aTrunc);
712}
713
714static SIMD_INLINE Vec<Float, 16> ceil(const Vec<Float, 16> &a)
715{
716 // if e>=23, floating point number represents an integer, 2^23 = 8388608
717 float32x4_t limit = vmovq_n_f32(8388608.f);
718 // bool mask: no rounding required if abs(a) >= limit
719 uint32x4_t noRndReq = vcgeq_f32(vabsq_f32(a), limit);
720// bool mask: true if a is non-negative
721 uint32x4_t isNotNeg =
722 vmvnq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31)));
723 // truncated result (for |a| < limit)
724 float32x4_t aTrunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
725 // check if a is an integer
726 uint32x4_t isNotInt = vmvnq_u32(vceqq_f32(a, aTrunc));
727 // constant 1.0
728 float32x4_t one = vmovq_n_f32(1.0f);
729 // mask which is 1.0f for non-negative non-integer values, 0.0f otherwise
730 float32x4_t oneMask = vreinterpretq_f32_u32(
731 vandq_u32(vandq_u32(isNotNeg, isNotInt), vreinterpretq_u32_f32(one)));
732// if non-negative, trunc computes floor; to turn it into ceil we
733// add 1 if a is not an integer
734 aTrunc = vaddq_f32(aTrunc, oneMask);
735 // select result (a or aTrunc)
736 return vbslq_f32(noRndReq, a, aTrunc);
737}
738
739// NOTE: rounds ties (*.5) towards positive infinity, unlike Intel's round-to-even
740static SIMD_INLINE Vec<Float, 16> round(const Vec<Float, 16> &a)
741{
742 return floor(add(a, set1(Float(0.5f), Integer<16>())));
743}
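
// Example: round(2.5f) = floor(3.0f) = 3 here, whereas Intel's
// round-to-even (and vrndnq on ARMv8) yields 2; round(-2.5f) = -2 in
// both cases.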
744
745#endif
746
747// -------------------------------------------------------------------------
748// elementary mathematical functions
749// -------------------------------------------------------------------------
750
751// estimate of a reciprocal
752SIMDVEC_NEON_UNARY(rcp, Float, vrecpeq, f32)
753#ifdef SIMD_64BIT_TYPES
754SIMDVEC_NEON_UNARY(rcp, Double, vrecpeq, f64)
755#endif
756
757// estimate of a reciprocal square root
758SIMDVEC_NEON_UNARY(rsqrt, Float, vrsqrteq, f32)
759#ifdef SIMD_64BIT_TYPES
760SIMDVEC_NEON_UNARY(rsqrt, Double, vrsqrteq, f64)
761#endif
762
763const auto SQRT_NEWTON_STEPS = 2;
764
765// square root (may not be very efficient)
766static SIMD_INLINE Vec<Float, 16> sqrt(const Vec<Float, 16> &a)
767{
768 // vector with 0s, vector with 1s
769 float32x4_t zero = vmovq_n_f32(0.0f), one = vmovq_n_f32(1.0f);
770 // check for 0 to avoid div-by-0 (should also cover -0.0f)
771 uint32x4_t isZero = vceqq_f32(a, zero);
772 // avoid inf in rev. sqrt, replace 0 by 1
773 float32x4_t as = vbslq_f32(isZero, one, a);
774 // get estimate of reciprocal sqrt
775 float32x4_t rSqrt = vrsqrteq_f32(as);
776 // refine estimate using Newton-Raphson steps
777 for (size_t i = 0; i < SQRT_NEWTON_STEPS; i++)
778 rSqrt = vmulq_f32(vrsqrtsq_f32(as, vmulq_f32(rSqrt, rSqrt)), rSqrt);
779 // sqrt(a) = a * (1.0 / sqrt(a))
780 float32x4_t res = vmulq_f32(as, rSqrt);
781 // select result
782 return vbslq_f32(isZero, zero, res);
783}
784
785#ifdef SIMD_64BIT_TYPES
786static SIMD_INLINE Vec<Double, 16> sqrt(const Vec<Double, 16> &a)
787{
788 // vector with 0s, vector with 1s
789 float64x2_t zero = vmovq_n_f64(0.0), one = vmovq_n_f64(1.0);
790 // check for 0 to avoid div-by-0 (should also cover -0.0)
791 uint64x2_t isZero = vceqq_f64(a, zero);
792 // avoid inf in rev. sqrt, replace 0 by 1
793 float64x2_t as = vbslq_f64(isZero, one, a);
794 // get estimate of reciprocal sqrt
795 float64x2_t rSqrt = vrsqrteq_f64(as);
796 // refine estimate using Newton-Raphson steps
797 for (size_t i = 0; i < SQRT_NEWTON_STEPS; i++)
798 rSqrt = vmulq_f64(vrsqrtsq_f64(as, vmulq_f64(rSqrt, rSqrt)), rSqrt);
799 // sqrt(a) = a * (1.0 / sqrt(a))
800 float64x2_t res = vmulq_f64(as, rSqrt);
801 // select result
802 return vbslq_f64(isZero, zero, res);
803}
804#endif
805
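// Math behind the refinement (sketch): vrsqrtsq_f32(a, x*x) computes
// (3 - a*x*x) / 2, so each iteration performs the Newton-Raphson step
//   x' = x * (3 - a*x^2) / 2
// which converges to 1/sqrt(a).
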
806// -------------------------------------------------------------------------
807// abs
808// -------------------------------------------------------------------------
809
810// 25. Mar 25 (Jonas Keller): added abs for unsigned integers
811
812// unsigned integers
813template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
814 &&std::is_integral<T>::value)>
815static SIMD_INLINE Vec<T, 16> abs(const Vec<T, 16> &a)
816{
817 return a;
818}
819
820SIMDVEC_NEON_UNARY(abs, SignedByte, vabsq, s8)
821SIMDVEC_NEON_UNARY(abs, Short, vabsq, s16)
822SIMDVEC_NEON_UNARY(abs, Int, vabsq, s32)
823SIMDVEC_NEON_UNARY(abs, Float, vabsq, f32)
824#ifdef SIMD_64BIT_TYPES
825SIMDVEC_NEON_UNARY(abs, Long, vabsq, s64)
826SIMDVEC_NEON_UNARY(abs, Double, vabsq, f64)
827#endif
828
829// -------------------------------------------------------------------------
830// unpack
831// -------------------------------------------------------------------------
832
833// TODO: unpack is inefficient here since vzipq does both unpacklo and
834// TODO: unpackhi but only half of the result is used
835
836// via cast to larger datatype
837#define SIMDVEC_NEON_UNPACK(TYPE, BYTES, NEON_SUF, NEON_SUF2) \
838 template <size_t PART> \
839 static SIMD_INLINE Vec<TYPE, 16> unpack( \
840 const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<PART>, Bytes<BYTES>) \
841 { \
842 return vreinterpretq_##NEON_SUF##_##NEON_SUF2( \
843 (vzipq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a), \
844 vreinterpretq_##NEON_SUF2##_##NEON_SUF(b))) \
845 .val[PART]); \
846 }
847
848// via extraction of low or high halves
849// (NOTE: PART and BYTES are needed in argument list)
850#define SIMDVEC_NEON_UNPACK_HALFS(TYPE, BYTES, NEON_SUF) \
851 static SIMD_INLINE Vec<TYPE, 16> unpack( \
852 const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<0>, Bytes<BYTES>) \
853 { \
854 return vcombine_##NEON_SUF(vget_low##_##NEON_SUF(a), \
855 vget_low##_##NEON_SUF(b)); \
856 } \
857 static SIMD_INLINE Vec<TYPE, 16> unpack( \
858 const Vec<TYPE, 16> &a, const Vec<TYPE, 16> &b, Part<1>, Bytes<BYTES>) \
859 { \
860 return vcombine_##NEON_SUF(vget_high##_##NEON_SUF(a), \
861 vget_high##_##NEON_SUF(b)); \
862 }
863
864SIMDVEC_NEON_UNPACK(Byte, 1, u8, u8)
865SIMDVEC_NEON_UNPACK(Byte, 2, u8, u16)
866SIMDVEC_NEON_UNPACK(Byte, 4, u8, u32)
867SIMDVEC_NEON_UNPACK_HALFS(Byte, 8, u8)
868SIMDVEC_NEON_UNPACK(SignedByte, 1, s8, s8)
869SIMDVEC_NEON_UNPACK(SignedByte, 2, s8, s16)
870SIMDVEC_NEON_UNPACK(SignedByte, 4, s8, s32)
871SIMDVEC_NEON_UNPACK_HALFS(SignedByte, 8, s8)
872SIMDVEC_NEON_UNPACK(Word, 2, u16, u16)
873SIMDVEC_NEON_UNPACK(Word, 4, u16, u32)
874SIMDVEC_NEON_UNPACK_HALFS(Word, 8, u16)
875SIMDVEC_NEON_UNPACK(Short, 2, s16, s16)
876SIMDVEC_NEON_UNPACK(Short, 4, s16, s32)
877SIMDVEC_NEON_UNPACK_HALFS(Short, 8, s16)
878SIMDVEC_NEON_UNPACK(Int, 4, s32, s32)
879SIMDVEC_NEON_UNPACK_HALFS(Int, 8, s32)
880SIMDVEC_NEON_UNPACK(Float, 4, f32, f32)
881SIMDVEC_NEON_UNPACK_HALFS(Float, 8, f32)
882
883#ifdef SIMD_64BIT_TYPES
884static SIMD_INLINE Vec<Long, 16> unpack(const Vec<Long, 16> &a,
885 const Vec<Long, 16> &b, Part<0>,
886 Bytes<8>)
887{
888 return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
889}
890static SIMD_INLINE Vec<Long, 16> unpack(const Vec<Long, 16> &a,
891 const Vec<Long, 16> &b, Part<1>,
892 Bytes<8>)
893{
894 return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
895}
896static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
897 const Vec<Double, 16> &b, Part<0>,
898 Bytes<8>)
899{
900 return vcombine_f64(vget_low_f64(a), vget_low_f64(b));
901}
902static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
903 const Vec<Double, 16> &b, Part<1>,
904 Bytes<8>)
905{
906 return vcombine_f64(vget_high_f64(a), vget_high_f64(b));
907}
908#endif
909
910#undef SIMDVEC_NEON_UNPACK
911#undef SIMDVEC_NEON_UNPACK_HALFS
912
913// ---------------------------------------------------------------------------
914// unpack16
915// ---------------------------------------------------------------------------
916
917// 16-byte-lane oriented unpack: for 16 bytes same as generalized unpack
918// unpack blocks of NUM_ELEMS elements of type T
919// PART=0: low half of input vectors,
920// PART=1: high half of input vectors
921template <size_t PART, size_t BYTES, typename T>
922static SIMD_INLINE Vec<T, 16> unpack16(const Vec<T, 16> &a, const Vec<T, 16> &b,
923 Part<PART>, Bytes<BYTES>)
924{
925 return unpack(a, b, Part<PART>(), Bytes<BYTES>());
926}
927
928// ---------------------------------------------------------------------------
929// extract 128-bit lane as Vec<T, 16>, does nothing for 16 bytes
930// ---------------------------------------------------------------------------
931
932template <size_t LANE_INDEX, typename T>
933static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 16> &a)
934{
935 return a;
936}
937
938// -------------------------------------------------------------------------
939// zip
940// -------------------------------------------------------------------------
941
942// a, b passed by-value to avoid problems with identical input/output args.
943
944// via cast to larger datatype
945#define SIMDVEC_NEON_ZIP(TYPE, NUM_ELEMS, NEON_SUF, NEON_SUF2, NEONX2_2) \
946 static SIMD_INLINE void zip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b, \
947 Vec<TYPE, 16> &c, Vec<TYPE, 16> &d, \
948 Elements<NUM_ELEMS>) \
949 { \
950 NEONX2_2 res; \
951 res = vzipq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a), \
952 vreinterpretq_##NEON_SUF2##_##NEON_SUF(b)); \
953 c = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[0]); \
954 d = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[1]); \
955 }
956
957// via extraction of low or high halves
958// (NOTE: NUM_ELEMS is needed in argument list)
959#define SIMDVEC_NEON_ZIP_HALFS(TYPE, NUM_ELEMS, NEON_SUF) \
960 static SIMD_INLINE void zip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b, \
961 Vec<TYPE, 16> &c, Vec<TYPE, 16> &d, \
962 Elements<NUM_ELEMS>) \
963 { \
964 c = vcombine_##NEON_SUF(vget_low_##NEON_SUF(a), vget_low_##NEON_SUF(b)); \
965 d = vcombine_##NEON_SUF(vget_high_##NEON_SUF(a), vget_high_##NEON_SUF(b)); \
966 }
967
968SIMDVEC_NEON_ZIP(Byte, 1, u8, u8, uint8x16x2_t)
969SIMDVEC_NEON_ZIP(Byte, 2, u8, u16, uint16x8x2_t)
970SIMDVEC_NEON_ZIP(Byte, 4, u8, u32, uint32x4x2_t)
971SIMDVEC_NEON_ZIP_HALFS(Byte, 8, u8)
972SIMDVEC_NEON_ZIP(SignedByte, 1, s8, s8, int8x16x2_t)
973SIMDVEC_NEON_ZIP(SignedByte, 2, s8, s16, int16x8x2_t)
974SIMDVEC_NEON_ZIP(SignedByte, 4, s8, s32, int32x4x2_t)
975SIMDVEC_NEON_ZIP_HALFS(SignedByte, 8, s8)
976SIMDVEC_NEON_ZIP(Word, 1, u16, u16, uint16x8x2_t)
977SIMDVEC_NEON_ZIP(Word, 2, u16, u32, uint32x4x2_t)
978SIMDVEC_NEON_ZIP_HALFS(Word, 4, u16)
979SIMDVEC_NEON_ZIP(Short, 1, s16, s16, int16x8x2_t)
980SIMDVEC_NEON_ZIP(Short, 2, s16, s32, int32x4x2_t)
981SIMDVEC_NEON_ZIP_HALFS(Short, 4, s16)
982SIMDVEC_NEON_ZIP(Int, 1, s32, s32, int32x4x2_t)
983SIMDVEC_NEON_ZIP_HALFS(Int, 2, s32)
984SIMDVEC_NEON_ZIP(Float, 1, f32, f32, float32x4x2_t)
985SIMDVEC_NEON_ZIP_HALFS(Float, 2, f32)
986
987#ifdef SIMD_64BIT_TYPES
988static SIMD_INLINE void zip(const Vec<Long, 16> a, const Vec<Long, 16> b,
989 Vec<Long, 16> &c, Vec<Long, 16> &d, Elements<1>)
990{
991 c = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
992 d = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
993}
994static SIMD_INLINE void zip(const Vec<Double, 16> a, const Vec<Double, 16> b,
995 Vec<Double, 16> &c, Vec<Double, 16> &d, Elements<1>)
996{
997 c = vcombine_f64(vget_low_f64(a), vget_low_f64(b));
998 d = vcombine_f64(vget_high_f64(a), vget_high_f64(b));
999}
1000#endif
1001
1002template <size_t NUM_ELEMS, typename T>
1003static SIMD_INLINE void zip(const Vec<T, 16> a, const Vec<T, 16> b,
1004 Vec<T, 16> &c, Vec<T, 16> &d)
1005{
1006 return zip(a, b, c, d, Elements<NUM_ELEMS>());
1007}
1008
1009#undef SIMDVEC_NEON_ZIP
1010#undef SIMDVEC_NEON_ZIP_HALFS
1011
1012// ---------------------------------------------------------------------------
1013// zip16 hub (16-byte-lane oriented zip): for 16 bytes same as zip
1014// ---------------------------------------------------------------------------
1015
1016// a, b are passed by-value to avoid problems with identical input/output args.
1017
1018template <size_t NUM_ELEMS, typename T>
1019static SIMD_INLINE void zip16(const Vec<T, 16> a, const Vec<T, 16> b,
1020 Vec<T, 16> &l, Vec<T, 16> &h)
1021{
1022 zip<NUM_ELEMS, T>(a, b, l, h);
1023}
1024
1025// -------------------------------------------------------------------------
1026// unzip
1027// -------------------------------------------------------------------------
1032
1033// a, b passed by-value to avoid problems with identical input/output args.
1034
1035// via cast to larger datatype
1036#define SIMDVEC_NEON_UNZIP(TYPE, BYTES, NEON_SUF, NEON_SUF2, NEONX2_2) \
1037 static SIMD_INLINE void unzip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b, \
1038 Vec<TYPE, 16> &c, Vec<TYPE, 16> &d, \
1039 Bytes<BYTES>) \
1040 { \
1041 NEONX2_2 res; \
1042 res = vuzpq_##NEON_SUF2(vreinterpretq_##NEON_SUF2##_##NEON_SUF(a), \
1043 vreinterpretq_##NEON_SUF2##_##NEON_SUF(b)); \
1044 c = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[0]); \
1045 d = vreinterpretq_##NEON_SUF##_##NEON_SUF2(res.val[1]); \
1046 }
1047
1048// via extraction of low or high halves
1049// (NOTE: BYTES is needed in argument list)
1050#define SIMDVEC_NEON_UNZIP_HALFS(TYPE, BYTES, NEON_SUF) \
1051 static SIMD_INLINE void unzip(const Vec<TYPE, 16> a, const Vec<TYPE, 16> b, \
1052 Vec<TYPE, 16> &c, Vec<TYPE, 16> &d, \
1053 Bytes<BYTES>) \
1054 { \
1055 c = vcombine_##NEON_SUF(vget_low_##NEON_SUF(a), vget_low_##NEON_SUF(b)); \
1056 d = vcombine_##NEON_SUF(vget_high_##NEON_SUF(a), vget_high_##NEON_SUF(b)); \
1057 }
1058
1059SIMDVEC_NEON_UNZIP(Byte, 1, u8, u8, uint8x16x2_t)
1060SIMDVEC_NEON_UNZIP(Byte, 2, u8, u16, uint16x8x2_t)
1061SIMDVEC_NEON_UNZIP(Byte, 4, u8, u32, uint32x4x2_t)
1062SIMDVEC_NEON_UNZIP_HALFS(Byte, 8, u8)
1063
1064SIMDVEC_NEON_UNZIP(SignedByte, 1, s8, s8, int8x16x2_t)
1065SIMDVEC_NEON_UNZIP(SignedByte, 2, s8, s16, int16x8x2_t)
1066SIMDVEC_NEON_UNZIP(SignedByte, 4, s8, s32, int32x4x2_t)
1067SIMDVEC_NEON_UNZIP_HALFS(SignedByte, 8, s8)
1068
1069SIMDVEC_NEON_UNZIP(Word, 2, u16, u16, uint16x8x2_t)
1070SIMDVEC_NEON_UNZIP(Word, 4, u16, u32, uint32x4x2_t)
1071SIMDVEC_NEON_UNZIP_HALFS(Word, 8, u16)
1072
1073SIMDVEC_NEON_UNZIP(Short, 2, s16, s16, int16x8x2_t)
1074SIMDVEC_NEON_UNZIP(Short, 4, s16, s32, int32x4x2_t)
1075SIMDVEC_NEON_UNZIP_HALFS(Short, 8, s16)
1076
1077SIMDVEC_NEON_UNZIP(Int, 4, s32, s32, int32x4x2_t)
1078SIMDVEC_NEON_UNZIP_HALFS(Int, 8, s32)
1079
1080SIMDVEC_NEON_UNZIP(Float, 4, f32, f32, float32x4x2_t)
1081SIMDVEC_NEON_UNZIP_HALFS(Float, 8, f32)
1082
1083#ifdef SIMD_64BIT_TYPES
1084static SIMD_INLINE void unzip(const Vec<Long, 16> a, const Vec<Long, 16> b,
1085 Vec<Long, 16> &c, Vec<Long, 16> &d, Bytes<8>)
1086{
1087 c = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
1088 d = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
1089}
1090static SIMD_INLINE void unzip(const Vec<Double, 16> a, const Vec<Double, 16> b,
1091 Vec<Double, 16> &c, Vec<Double, 16> &d, Bytes<8>)
1092{
1093 c = vcombine_f64(vget_low_f64(a), vget_low_f64(b));
1094 d = vcombine_f64(vget_high_f64(a), vget_high_f64(b));
1095}
1096#endif
1097
1098#undef SIMDVEC_NEON_UNZIP
1099#undef SIMDVEC_NEON_UNZIP_HALFS
1100
1101// ---------------------------------------------------------------------------
1102// packs
1103// ---------------------------------------------------------------------------
1104
1105// signed -> signed
1106
1107static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Short, 16> &a,
1108 const Vec<Short, 16> &b,
1109 OutputType<SignedByte>)
1110{
1111 return vcombine_s8(vqmovn_s16(a), vqmovn_s16(b));
1112}
1113
1114static SIMD_INLINE Vec<Short, 16> packs(const Vec<Int, 16> &a,
1115 const Vec<Int, 16> &b,
1116 OutputType<Short>)
1117{
1118 return vcombine_s16(vqmovn_s32(a), vqmovn_s32(b));
1119}
1120
1121static SIMD_INLINE Vec<Short, 16> packs(const Vec<Float, 16> &a,
1122 const Vec<Float, 16> &b,
1123 OutputType<Short>)
1124{
1125 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
1126 OutputType<Short>());
1127}
1128
1129#ifdef SIMD_64BIT_TYPES
1130static SIMD_INLINE Vec<Int, 16> packs(const Vec<Long, 16> &a,
1131 const Vec<Long, 16> &b, OutputType<Int>)
1132{
1133 return vcombine_s32(vqmovn_s64(a), vqmovn_s64(b));
1134}
1135
1136static SIMD_INLINE Vec<Int, 16> packs(const Vec<Double, 16> &a,
1137 const Vec<Double, 16> &b, OutputType<Int>)
1138{
1139 return vcombine_s32(vqmovn_s64(vcvtq_s64_f64(a)),
1140 vqmovn_s64(vcvtq_s64_f64(b)));
1141}
1142
1143static SIMD_INLINE Vec<Float, 16> packs(const Vec<Long, 16> &a,
1144 const Vec<Long, 16> &b,
1145 OutputType<Float>)
1146{
1147 return vcombine_f32(vcvt_f32_f64(vcvtq_f64_s64(a)),
1148 vcvt_f32_f64(vcvtq_f64_s64(b)));
1149}
1150
1151static SIMD_INLINE Vec<Float, 16> packs(const Vec<Double, 16> &a,
1152 const Vec<Double, 16> &b,
1153 OutputType<Float>)
1154{
1155 return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b));
1156}
1157#endif
1158
1159// unsigned -> unsigned
1160
1161static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Word, 16> &a,
1162 const Vec<Word, 16> &b, OutputType<Byte>)
1163{
1164 return vcombine_u8(vqmovn_u16(a), vqmovn_u16(b));
1165}
1166
1167// signed -> unsigned
1168
1169static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Short, 16> &a,
1170 const Vec<Short, 16> &b,
1171 OutputType<Byte>)
1172{
1173 return vcombine_u8(vqmovun_s16(a), vqmovun_s16(b));
1174}
1175
1176static SIMD_INLINE Vec<Word, 16> packs(const Vec<Int, 16> &a,
1177 const Vec<Int, 16> &b, OutputType<Word>)
1178{
1179 return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
1180}
1181
1182static SIMD_INLINE Vec<Word, 16> packs(const Vec<Float, 16> &a,
1183 const Vec<Float, 16> &b,
1184 OutputType<Word>)
1185{
1186 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
1187 OutputType<Word>());
1188}
1189
1190// unsigned -> signed
1191
1192static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Word, 16> &a,
1193 const Vec<Word, 16> &b,
1194 OutputType<SignedByte>)
1195{
1196 return vcombine_s8(
1197 vreinterpret_s8_u8(vmin_u8(vqmovn_u16(a), vdup_n_u8(0x7f))),
1198 vreinterpret_s8_u8(vmin_u8(vqmovn_u16(b), vdup_n_u8(0x7f))));
1199}
1200
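// Example: for Word lanes 100, 200 and 40000, vqmovn_u16 narrows with
// saturation to 100, 200 and 255; the vmin_u8 against 0x7f then clamps
// to the SignedByte range, giving 100, 127 and 127.
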
1201// -------------------------------------------------------------------------
1202// generalized extend: no stage
1203// -------------------------------------------------------------------------
1204
1205// combinations:
1206// - signed -> extended signed (sign extension)
1207// - unsigned -> extended unsigned (zero extension)
1208// - unsigned -> extended signed (zero extension)
1209// - signed -> extended unsigned (saturation and zero extension)
1210
1211// same type
1212template <typename T>
1213static SIMD_INLINE void extend(const Vec<T, 16> &vIn, Vec<T, 16> vOut[1])
1214{
1215 vOut[0] = vIn;
1216}
1217
1218// same size, different type
1219
1220static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1221 Vec<Byte, 16> vOut[1])
1222{
1223 vOut[0] = vreinterpretq_u8_s8(vmaxq_s8(vIn, vdupq_n_s8(0)));
1224}
1225
1226static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
1227 Vec<SignedByte, 16> vOut[1])
1228{
1229 vOut[0] = vreinterpretq_s8_u8(vminq_u8(vIn, vdupq_n_u8(0x7f)));
1230}
1231
1232static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Word, 16> vOut[1])
1233{
1234 vOut[0] = vreinterpretq_u16_s16(vmaxq_s16(vIn, vdupq_n_s16(0)));
1235}
1236
1237static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Short, 16> vOut[1])
1238{
1239 vOut[0] = vreinterpretq_s16_u16(vminq_u16(vIn, vdupq_n_u16(0x7fff)));
1240}
1241
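// Example: these same-size conversions saturate at the target type's
// bounds, e.g. SignedByte -5 extends to Byte 0, and Byte 200 extends
// to SignedByte 127.
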
1242// -------------------------------------------------------------------------
1243// generalized extend: single stage
1244// -------------------------------------------------------------------------
1245
1246// signed -> signed
1247
1248static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1249 Vec<Short, 16> vOut[2])
1250{
1251 vOut[0] = vmovl_s8(vget_low_s8(vIn));
1252 vOut[1] = vmovl_s8(vget_high_s8(vIn));
1253}
1254
1255static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Int, 16> vOut[2])
1256{
1257 vOut[0] = vmovl_s16(vget_low_s16(vIn));
1258 vOut[1] = vmovl_s16(vget_high_s16(vIn));
1259}
1260
1261static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
1262 Vec<Float, 16> vOut[2])
1263{
1264 vOut[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vIn)));
1265 vOut[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vIn)));
1266}
1267
1268#ifdef SIMD_64BIT_TYPES
1269static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Long, 16> vOut[2])
1270{
1271 vOut[0] = vmovl_s32(vget_low_s32(vIn));
1272 vOut[1] = vmovl_s32(vget_high_s32(vIn));
1273}
1274
1275static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Double, 16> vOut[2])
1276{
1277 vOut[0] = vcvtq_f64_s64(vmovl_s32(vget_low_s32(vIn)));
1278 vOut[1] = vcvtq_f64_s64(vmovl_s32(vget_high_s32(vIn)));
1279}
1280
1281static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Long, 16> vOut[2])
1282{
1283 vOut[0] = vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(vIn)));
1284 vOut[1] = vcvtq_s64_f64(vcvt_f64_f32(vget_high_f32(vIn)));
1285}
1286
1287static SIMD_INLINE void extend(const Vec<Float, 16> &vIn,
1288 Vec<Double, 16> vOut[2])
1289{
1290 vOut[0] = vcvt_f64_f32(vget_low_f32(vIn));
1291 vOut[1] = vcvt_f64_f32(vget_high_f32(vIn));
1292}
1293#endif
1294
1295// unsigned -> unsigned
1296
1297static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Word, 16> vOut[2])
1298{
1299 vOut[0] = vmovl_u8(vget_low_u8(vIn));
1300 vOut[1] = vmovl_u8(vget_high_u8(vIn));
1301}
1302
1303// unsigned -> signed
1304
1305static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Short, 16> vOut[2])
1306{
1307 vOut[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vIn)));
1308 vOut[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vIn)));
1309}
1310
1311static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Int, 16> vOut[2])
1312{
1313 vOut[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vIn)));
1314 vOut[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vIn)));
1315}
1316
1317static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Float, 16> vOut[2])
1318{
1319 vOut[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vIn)));
1320 vOut[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vIn)));
1321}
1322
1323// signed -> unsigned
1324
1325static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1326 Vec<Word, 16> vOut[2])
1327{
1328 const auto saturated = vmaxq_s8(vIn, vdupq_n_s8(0));
1329 vOut[0] = vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(saturated)));
1330 vOut[1] = vmovl_u8(vget_high_u8(vreinterpretq_u8_s8(saturated)));
1331}
1332
1333// -------------------------------------------------------------------------
1334// generalized extend: two stages
1335// -------------------------------------------------------------------------
1336
1337// signed -> signed
1338
1339static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1340 Vec<Int, 16> vOut[4])
1341{
1342 Vec<Short, 16> vShort[2];
1343 extend(vIn, vShort);
1344 extend(vShort[0], vOut);
1345 extend(vShort[1], vOut + 2);
1346}
1347
1348static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1349 Vec<Float, 16> vOut[4])
1350{
1351 Vec<Short, 16> vShort[2];
1352 extend(vIn, vShort);
1353 extend(vShort[0], vOut);
1354 extend(vShort[1], vOut + 2);
1355}
1356
1357#ifdef SIMD_64BIT_TYPES
1358static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Long, 16> vOut[4])
1359{
1360 Vec<Int, 16> vInt[2];
1361 extend(vIn, vInt);
1362 extend(vInt[0], vOut);
1363 extend(vInt[1], vOut + 2);
1364}
1365
1366static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
1367 Vec<Double, 16> vOut[4])
1368{
1369 Vec<Int, 16> vInt[2];
1370 extend(vIn, vInt);
1371 extend(vInt[0], vOut);
1372 extend(vInt[1], vOut + 2);
1373}
1374#endif
1375
1376// unsigned -> signed
1377
1378static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Int, 16> vOut[4])
1379{
1380 Vec<Short, 16> vShort[2];
1381 extend(vIn, vShort);
1382 extend(vShort[0], vOut);
1383 extend(vShort[1], vOut + 2);
1384}
1385
1386static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Float, 16> vOut[4])
1387{
1388 Vec<Short, 16> vShort[2];
1389 extend(vIn, vShort);
1390 extend(vShort[0], vOut);
1391 extend(vShort[1], vOut + 2);
1392}
1393
1394#ifdef SIMD_64BIT_TYPES
1395static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Long, 16> vOut[4])
1396{
1397 Vec<Int, 16> vInt[2];
1398 extend(vIn, vInt);
1399 extend(vInt[0], vOut);
1400 extend(vInt[1], vOut + 2);
1401}
1402
1403static SIMD_INLINE void extend(const Vec<Word, 16> &vIn,
1404 Vec<Double, 16> vOut[4])
1405{
1406 Vec<Int, 16> vInt[2];
1407 extend(vIn, vInt);
1408 extend(vInt[0], vOut);
1409 extend(vInt[1], vOut + 2);
1410}
1411#endif
1412
1413// -------------------------------------------------------------------------
1414// generalized extend: three stages
1415// -------------------------------------------------------------------------
1416
1417// signed -> signed
1418
1419#ifdef SIMD_64BIT_TYPES
1420static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1421 Vec<Long, 16> vOut[8])
1422{
1423 Vec<Int, 16> vInt[4];
1424 extend(vIn, vInt);
1425 extend(vInt[0], vOut);
1426 extend(vInt[1], vOut + 2);
1427 extend(vInt[2], vOut + 4);
1428 extend(vInt[3], vOut + 6);
1429}
1430
1431static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
1432 Vec<Double, 16> vOut[8])
1433{
1434 Vec<Int, 16> vInt[4];
1435 extend(vIn, vInt);
1436 extend(vInt[0], vOut);
1437 extend(vInt[1], vOut + 2);
1438 extend(vInt[2], vOut + 4);
1439 extend(vInt[3], vOut + 6);
1440}
1441#endif
1442
1443// unsigned -> signed
1444
1445#ifdef SIMD_64BIT_TYPES
1446static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Long, 16> vOut[8])
1447{
1448 Vec<Int, 16> vInt[4];
1449 extend(vIn, vInt);
1450 extend(vInt[0], vOut);
1451 extend(vInt[1], vOut + 2);
1452 extend(vInt[2], vOut + 4);
1453 extend(vInt[3], vOut + 6);
1454}
1455
1456static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
1457 Vec<Double, 16> vOut[8])
1458{
1459 Vec<Int, 16> vInt[4];
1460 extend(vIn, vInt);
1461 extend(vInt[0], vOut);
1462 extend(vInt[1], vOut + 2);
1463 extend(vInt[2], vOut + 4);
1464 extend(vInt[3], vOut + 6);
1465}
1466#endif
1467
1468// -------------------------------------------------------------------------
1469// generalized extend: special case int <-> float, long <-> double
1470// -------------------------------------------------------------------------
1471
1472static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Float, 16> vOut[1])
1473{
1474 vOut[0] = cvts(vIn, OutputType<Float>());
1475}
1476
1477static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Int, 16> vOut[1])
1478{
1479 vOut[0] = cvts(vIn, OutputType<Int>());
1480}
1481
1482#ifdef SIMD_64BIT_TYPES
1483static SIMD_INLINE void extend(const Vec<Long, 16> &vIn,
1484 Vec<Double, 16> vOut[1])
1485{
1486 vOut[0] = cvts(vIn, OutputType<Double>());
1487}
1488
1489static SIMD_INLINE void extend(const Vec<Double, 16> &vIn,
1490 Vec<Long, 16> vOut[1])
1491{
1492 vOut[0] = cvts(vIn, OutputType<Long>());
1493}
1494#endif
1495
1496// -------------------------------------------------------------------------
1497// shift functions
1498// -------------------------------------------------------------------------
1499
1500// it was necessary to introduce a special case for COUNT == 0, since
1501// a zero count is not allowed for the shift intrinsics (the special
1502// case just returns the argument); since the ARM docs aren't clear on
1503// this point, we also treat COUNT == no-of-bits as a special case (in
1504// two versions: one applying FCT with sizeof(TYPE)*8 - 1, the other
1505// setting the result to zero)
1506
1507// is non-zero and in a range
1508template <bool nonZero, bool inRange>
1509struct IsNonZeroInRange
1510{};
1511
1512// is non-zero and in a given range
1513template <size_t RANGE, size_t INDEX>
1514struct IsNonZeroInGivenRange
1515 : public IsNonZeroInRange<(INDEX != 0), (INDEX < RANGE)>
1516{};
1517
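// Example of how the tag dispatch below resolves for Byte (8 bits):
//   srli<0> -> IsNonZeroInRange<false, true>  -> returns a unchanged
//   srli<3> -> IsNonZeroInRange<true, true>   -> vshrq_n_u8(a, 3)
//   srli<8> -> IsNonZeroInRange<true, false>  -> returns zero
// (srai<8> instead shifts by 7, replicating the sign bit)
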
1518#define SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF) \
1519 template <size_t COUNT> \
1520 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1521 IsNonZeroInRange<true, true>) \
1522 { \
1523 return NEON_FCT##_##NEON_SUF(a, COUNT); \
1524 } \
1525 template <size_t COUNT> \
1526 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1527 IsNonZeroInRange<false, true>) \
1528 { \
1529 return a; \
1530 } \
1531 template <size_t COUNT> \
1532 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a) \
1533 { \
1534 return FCT<COUNT>(a, IsNonZeroInGivenRange<sizeof(TYPE) * 8, COUNT>()); \
1535 }
1536
1537// out-of-range implemented with FCT of sizeof(TYPE)*8 - 1
1538#define SIMDVEC_NEON_SHIFT_ARITH(FCT, TYPE, NEON_FCT, NEON_SUF) \
1539 template <size_t COUNT> \
1540 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1541 IsNonZeroInRange<true, false>) \
1542 { \
1543 return NEON_FCT##_##NEON_SUF(a, sizeof(TYPE) * 8 - 1); \
1544 } \
1545 SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF)
1546
1547// out-of-range implemented with set-to-zero
1548#define SIMDVEC_NEON_SHIFT_LOGICAL(FCT, TYPE, NEON_FCT, NEON_SUF) \
1549 template <size_t COUNT> \
1550 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &, \
1551 IsNonZeroInRange<true, false>) \
1552 { \
1553 return vmovq_n_##NEON_SUF(TYPE(0)); \
1554 } \
1555 SIMDVEC_NEON_SHIFT(FCT, TYPE, NEON_FCT, NEON_SUF)
1556
1557#define SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2) \
1558 template <size_t COUNT> \
1559 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1560 IsNonZeroInRange<true, true>) \
1561 { \
1562 return vreinterpretq_##NSUF##_##NSUF2( \
1563 NFCT##_##NSUF2(vreinterpretq_##NSUF2##_##NSUF(a), COUNT)); \
1564 } \
1565 template <size_t COUNT> \
1566 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1567 IsNonZeroInRange<false, true>) \
1568 { \
1569 return a; \
1570 } \
1571 template <size_t COUNT> \
1572 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a) \
1573 { \
1574 return FCT<COUNT>(a, IsNonZeroInGivenRange<sizeof(TYPE) * 8, COUNT>()); \
1575 }
1576
1577#define SIMDVEC_NEON_SHIFT_REINTER_ARITH(FCT, TYPE, NFCT, NSUF, NSUF2) \
1578 template <size_t COUNT> \
1579 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &a, \
1580 IsNonZeroInRange<true, false>) \
1581 { \
1582 return vreinterpretq_##NSUF##_##NSUF2(NFCT##_##NSUF2( \
1583 vreinterpretq_##NSUF2##_##NSUF(a), sizeof(TYPE) * 8 - 1)); \
1584 } \
1585 SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2)
1586
1587#define SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(FCT, TYPE, NFCT, NSUF, NSUF2) \
1588 template <size_t COUNT> \
1589 static SIMD_INLINE Vec<TYPE, 16> FCT(const Vec<TYPE, 16> &, \
1590 IsNonZeroInRange<true, false>) \
1591 { \
1592 return vmovq_n_##NSUF(TYPE(0)); \
1593 } \
1594 SIMDVEC_NEON_SHIFT_REINTER(FCT, TYPE, NFCT, NSUF, NSUF2)
1595
1596// srai
1597
1598// requires cast of unsigned types to signed!
1599// http://stackoverflow.com/questions/18784988/neon-intrinsic-for-arithmetic-shift
1600// out-of-range case handled with FCT=srai
1601
1602// 13. Nov 22 (Jonas Keller):
1603// added missing Byte and SignedByte versions of srai
1604
1605SIMDVEC_NEON_SHIFT_REINTER_ARITH(srai, Byte, vshrq_n, u8, s8)
1606SIMDVEC_NEON_SHIFT_ARITH(srai, SignedByte, vshrq_n, s8)
1607SIMDVEC_NEON_SHIFT_REINTER_ARITH(srai, Word, vshrq_n, u16, s16)
1608SIMDVEC_NEON_SHIFT_ARITH(srai, Short, vshrq_n, s16)
1609SIMDVEC_NEON_SHIFT_ARITH(srai, Int, vshrq_n, s32)
1610#ifdef SIMD_64BIT_TYPES
1611SIMDVEC_NEON_SHIFT_ARITH(srai, Long, vshrq_n, s64)
1612#endif
1613
1614// srli
1615
1616// requires cast of signed types to unsigned!
1617// http://stackoverflow.com/questions/18784988/neon-intrinsic-for-arithmetic-shift
1618// out-of-range case handled with set-to-zero
1619
1620SIMDVEC_NEON_SHIFT_LOGICAL(srli, Byte, vshrq_n, u8)
1621SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, SignedByte, vshrq_n, s8, u8)
1622SIMDVEC_NEON_SHIFT_LOGICAL(srli, Word, vshrq_n, u16)
1623SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Short, vshrq_n, s16, u16)
1624SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Int, vshrq_n, s32, u32)
1625#ifdef SIMD_64BIT_TYPES
1626SIMDVEC_NEON_SHIFT_REINTER_LOGICAL(srli, Long, vshrq_n, s64, u64)
1627#endif
1628
1629// slli
1630
1631// out-of-range case handled with set-to-zero
1632SIMDVEC_NEON_SHIFT_LOGICAL(slli, Byte, vshlq_n, u8)
1633SIMDVEC_NEON_SHIFT_LOGICAL(slli, SignedByte, vshlq_n, s8)
1634SIMDVEC_NEON_SHIFT_LOGICAL(slli, Word, vshlq_n, u16)
1635SIMDVEC_NEON_SHIFT_LOGICAL(slli, Short, vshlq_n, s16)
1636SIMDVEC_NEON_SHIFT_LOGICAL(slli, Int, vshlq_n, s32)
1637#ifdef SIMD_64BIT_TYPES
1638SIMDVEC_NEON_SHIFT_LOGICAL(slli, Long, vshlq_n, s64)
1639#endif
1640
1641#undef SIMDVEC_NEON_SHIFT
1642#undef SIMDVEC_NEON_SHIFT_ARITH
1643#undef SIMDVEC_NEON_SHIFT_LOGICAL
1644#undef SIMDVEC_NEON_SHIFT_REINTER
1645#undef SIMDVEC_NEON_SHIFT_REINTER_ARITH
1646#undef SIMDVEC_NEON_SHIFT_REINTER_LOGICAL
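// A minimal sketch of the compile-time dispatch above (COUNT values are
// arbitrary): in-range counts map to a single vshrq_n/vshlq_n instruction,
// COUNT == 0 is the identity, and out-of-range logical shifts collapse to
// a zero vector.
//
//   Vec<Word, 16> v = ...;   // any input
//   auto a = srli<3>(v);     // vshrq_n_u16(v, 3)
//   auto b = srli<0>(v);     // returns v unchanged
//   auto c = srli<16>(v);    // vmovq_n_u16(0): all bits shifted out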
1647
1648// 19. Dec 22 (Jonas Keller): added sra, srl and sll functions
1649
1650// -------------------------------------------------------------------------
1651// sra
1652// -------------------------------------------------------------------------
1653
1654static SIMD_INLINE Vec<Byte, 16> sra(const Vec<Byte, 16> &a,
1655 const uint8_t count)
1656{
1657 if (count == 0) {
1658    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1659 return a;
1660 }
1661 int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
1662 return vreinterpretq_u8_s8(
1663 vshlq_s8(vreinterpretq_s8_u8(a), vdupq_n_s8(scount)));
1664}
1665
1666static SIMD_INLINE Vec<SignedByte, 16> sra(const Vec<SignedByte, 16> &a,
1667 const uint8_t count)
1668{
1669 if (count == 0) {
1670    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1671 return a;
1672 }
1673 int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
1674 return vshlq_s8(a, vdupq_n_s8(scount));
1675}
1676
1677static SIMD_INLINE Vec<Word, 16> sra(const Vec<Word, 16> &a,
1678 const uint8_t count)
1679{
1680 if (count == 0) {
1681    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1682 return a;
1683 }
1684 int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
1685 return vreinterpretq_u16_s16(
1686 vshlq_s16(vreinterpretq_s16_u16(a), vdupq_n_s16(scount)));
1687}
1688
1689static SIMD_INLINE Vec<Short, 16> sra(const Vec<Short, 16> &a,
1690 const uint8_t count)
1691{
1692 if (count == 0) {
1693    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1694 return a;
1695 }
1696 int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
1697 return vshlq_s16(a, vdupq_n_s16(scount));
1698}
1699
1700static SIMD_INLINE Vec<Int, 16> sra(const Vec<Int, 16> &a, const uint8_t count)
1701{
1702 if (count == 0) {
1703    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1704 return a;
1705 }
1706 int8_t scount = -((int8_t) std::min(count, uint8_t(32)));
1707 return vshlq_s32(a, vdupq_n_s32(scount));
1708}
1709
1710#ifdef SIMD_64BIT_TYPES
1711static SIMD_INLINE Vec<Long, 16> sra(const Vec<Long, 16> &a,
1712 const uint8_t count)
1713{
1714 if (count == 0) {
1715    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1716 return a;
1717 }
1718 int8_t scount = -((int8_t) std::min(count, uint8_t(64)));
1719 return vshlq_s64(a, vdupq_n_s64(scount));
1720}
1721#endif
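// The functions above rely on NEON's variable-shift semantics: vshlq
// shifts left for positive per-lane counts and right for negative ones,
// so a right shift by count becomes vshlq by -count. A minimal sketch
// with arbitrary values:
//
//   int16x8_t x = vdupq_n_s16(-104);
//   int16x8_t r = vshlq_s16(x, vdupq_n_s16(-3)); // arithmetic >> 3: -13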
1722
1723// -------------------------------------------------------------------------
1724// srl
1725// -------------------------------------------------------------------------
1726
1727static SIMD_INLINE Vec<Byte, 16> srl(const Vec<Byte, 16> &a,
1728 const uint8_t count)
1729{
1730 if (count == 0) {
1731    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1732 return a;
1733 }
1734 int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
1735 return vshlq_u8(a, vdupq_n_s8(scount));
1736}
1737
1738static SIMD_INLINE Vec<SignedByte, 16> srl(const Vec<SignedByte, 16> &a,
1739 const uint8_t count)
1740{
1741 if (count == 0) {
1742    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1743 return a;
1744 }
1745 int8_t scount = -((int8_t) std::min(count, uint8_t(8)));
1746 return vreinterpretq_s8_u8(
1747 vshlq_u8(vreinterpretq_u8_s8(a), vdupq_n_s8(scount)));
1748}
1749
1750static SIMD_INLINE Vec<Word, 16> srl(const Vec<Word, 16> &a,
1751 const uint8_t count)
1752{
1753 if (count == 0) {
1754    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1755 return a;
1756 }
1757 int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
1758 return vshlq_u16(a, vdupq_n_s16(scount));
1759}
1760
1761static SIMD_INLINE Vec<Short, 16> srl(const Vec<Short, 16> &a,
1762 const uint8_t count)
1763{
1764 if (count == 0) {
1765    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1766 return a;
1767 }
1768 int8_t scount = -((int8_t) std::min(count, uint8_t(16)));
1769 return vreinterpretq_s16_u16(
1770 vshlq_u16(vreinterpretq_u16_s16(a), vdupq_n_s16(scount)));
1771}
1772
1773static SIMD_INLINE Vec<Int, 16> srl(const Vec<Int, 16> &a, const uint8_t count)
1774{
1775 if (count == 0) {
1776    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1777 return a;
1778 }
1779 int8_t scount = -((int8_t) std::min(count, uint8_t(32)));
1780 return vreinterpretq_s32_u32(
1781 vshlq_u32(vreinterpretq_u32_s32(a), vdupq_n_s32(scount)));
1782}
1783
1784#ifdef SIMD_64BIT_TYPES
1785static SIMD_INLINE Vec<Long, 16> srl(const Vec<Long, 16> &a,
1786 const uint8_t count)
1787{
1788 if (count == 0) {
1789    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1790 return a;
1791 }
1792 int8_t scount = -((int8_t) std::min(count, uint8_t(64)));
1793 return vreinterpretq_s64_u64(
1794 vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(scount)));
1795}
1796#endif
1797
1798// -------------------------------------------------------------------------
1799// sll
1800// -------------------------------------------------------------------------
1801
1802static SIMD_INLINE Vec<Byte, 16> sll(const Vec<Byte, 16> &a,
1803 const uint8_t count)
1804{
1805 if (count == 0) {
1806    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1807 return a;
1808 }
1809 return vshlq_u8(a, vdupq_n_s8(std::min(count, uint8_t(8))));
1810}
1811
1812static SIMD_INLINE Vec<SignedByte, 16> sll(const Vec<SignedByte, 16> &a,
1813 const uint8_t count)
1814{
1815 if (count == 0) {
1816    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1817 return a;
1818 }
1819 return vshlq_s8(a, vdupq_n_s8(std::min(count, uint8_t(8))));
1820}
1821
1822static SIMD_INLINE Vec<Word, 16> sll(const Vec<Word, 16> &a,
1823 const uint8_t count)
1824{
1825 if (count == 0) {
1826    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1827 return a;
1828 }
1829 return vshlq_u16(a, vdupq_n_s16(std::min(count, uint8_t(16))));
1830}
1831
1832static SIMD_INLINE Vec<Short, 16> sll(const Vec<Short, 16> &a,
1833 const uint8_t count)
1834{
1835 if (count == 0) {
1836    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1837 return a;
1838 }
1839 return vshlq_s16(a, vdupq_n_s16(std::min(count, uint8_t(16))));
1840}
1841
1842static SIMD_INLINE Vec<Int, 16> sll(const Vec<Int, 16> &a, const uint8_t count)
1843{
1844 if (count == 0) {
1845    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1846 return a;
1847 }
1848 return vshlq_s32(a, vdupq_n_s32(std::min(count, uint8_t(32))));
1849}
1850
1851#ifdef SIMD_64BIT_TYPES
1852static SIMD_INLINE Vec<Long, 16> sll(const Vec<Long, 16> &a,
1853 const uint8_t count)
1854{
1855 if (count == 0) {
1856    // not strictly necessary: vshlq with a zero shift count leaves all lanes unchanged
1857 return a;
1858 }
1859 return vshlq_s64(a, vdupq_n_s64(std::min(count, uint8_t(64))));
1860}
1861#endif
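// Minimal usage sketch for the run-time shifts (count is a hypothetical
// run-time value); counts are clamped to the element width, so oversized
// counts shift out all bits:
//
//   uint8_t count = 20;           // e.g. computed at run time
//   Vec<Short, 16> v = ...;
//   auto r = srl(v, count);       // clamped to 16 -> all lanes zero
//   auto s = sll(v, uint8_t(2));  // each lane shifted left by 2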
1862
1863// 26. Sep 22 (Jonas Keller):
1864// added Byte and SignedByte versions of hadd, hadds, hsub and hsubs
1865// added Word version of hadds and hsubs
1866
1867// -------------------------------------------------------------------------
1868// hadd
1869// -------------------------------------------------------------------------
1870
1871#define SIMDVEC_NEON_HADD(TYPE, NEON_SUF) \
1872 static SIMD_INLINE Vec<TYPE, 16> hadd(const Vec<TYPE, 16> &a, \
1873 const Vec<TYPE, 16> &b) \
1874 { \
1875 return vcombine_##NEON_SUF( \
1876 vpadd_##NEON_SUF(vget_low_##NEON_SUF(a), vget_high_##NEON_SUF(a)), \
1877 vpadd_##NEON_SUF(vget_low_##NEON_SUF(b), vget_high_##NEON_SUF(b))); \
1878 }
1879
1880SIMDVEC_NEON_HADD(Byte, u8)
1881SIMDVEC_NEON_HADD(SignedByte, s8)
1882SIMDVEC_NEON_HADD(Word, u16)
1883SIMDVEC_NEON_HADD(Short, s16)
1884SIMDVEC_NEON_HADD(Int, s32)
1885SIMDVEC_NEON_HADD(Float, f32)
1886#ifdef SIMD_64BIT_TYPES
1887// vpadd_s64 does not exist, because int64x1_t is just a long, so we use the
1888// regular plus operator for long
1889static SIMD_INLINE Vec<Long, 16> hadd(const Vec<Long, 16> &a,
1890 const Vec<Long, 16> &b)
1891{
1892 return vcombine_s64(vget_low_s64(a) + vget_high_s64(a),
1893 vget_low_s64(b) + vget_high_s64(b));
1894}
1895// vpadd_f64 does not exist, because float64x1_t is just a double, so we use
1896// the regular plus operator for double
1897static SIMD_INLINE Vec<Double, 16> hadd(const Vec<Double, 16> &a,
1898 const Vec<Double, 16> &b)
1899{
1900 return vcombine_f64(vget_low_f64(a) + vget_high_f64(a),
1901 vget_low_f64(b) + vget_high_f64(b));
1902}
1903#endif
1904
1905#undef SIMDVEC_NEON_HADD
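// hadd adds neighboring lanes pairwise and packs the sums of a into the
// low half of the result and the sums of b into the high half. A minimal
// sketch with made-up lane values (Int, four lanes per vector):
//
//   a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
//   hadd(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3}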
1906
1907// -------------------------------------------------------------------------
1908// hadds
1909// -------------------------------------------------------------------------
1910
1911template <typename T>
1912static SIMD_INLINE Vec<T, 16> hadds(const Vec<T, 16> &a, const Vec<T, 16> &b)
1913{
1914 Vec<T, 16> x, y;
1915 unzip(a, b, x, y, Bytes<sizeof(T)>());
1916 return adds(x, y);
1917}
1918
1919static SIMD_INLINE Vec<Short, 16> hadds(const Vec<Short, 16> &a,
1920 const Vec<Short, 16> &b)
1921{
1922 return vcombine_s16(vqmovn_s32(vpaddlq_s16(a)), vqmovn_s32(vpaddlq_s16(b)));
1923}
1924
1925static SIMD_INLINE Vec<Int, 16> hadds(const Vec<Int, 16> &a,
1926 const Vec<Int, 16> &b)
1927{
1928 return vcombine_s32(vqmovn_s64(vpaddlq_s32(a)), vqmovn_s64(vpaddlq_s32(b)));
1929}
1930
1931// Float not saturated
1932static SIMD_INLINE Vec<Float, 16> hadds(const Vec<Float, 16> &a,
1933 const Vec<Float, 16> &b)
1934{
1935 return hadd(a, b);
1936}
1937
1938#ifdef SIMD_64BIT_TYPES
1939// Double not saturated
1940static SIMD_INLINE Vec<Double, 16> hadds(const Vec<Double, 16> &a,
1941 const Vec<Double, 16> &b)
1942{
1943 return hadd(a, b);
1944}
1945#endif
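// The Short version widens each pair to 32 bit (vpaddlq_s16), so the sum
// cannot wrap, and then narrows back with saturation (vqmovn_s32). A
// worked example with made-up values: lanes 0 and 1 of a holding 30000
// and 20000 sum to 50000 as int32, which vqmovn_s32 saturates to 32767
// instead of letting it wrap to -15536.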
1946
1947// -------------------------------------------------------------------------
1948// hsub
1949// -------------------------------------------------------------------------
1950
1951template <typename T>
1952static SIMD_INLINE Vec<T, 16> hsub(const Vec<T, 16> &a, const Vec<T, 16> &b)
1953{
1954 Vec<T, 16> x, y;
1955 unzip(a, b, x, y, Bytes<sizeof(T)>());
1956 return sub(x, y);
1957}
1958
1959#ifdef SIMD_64BIT_TYPES
1960static SIMD_INLINE Vec<Double, 16> hsub(const Vec<Double, 16> &a,
1961 const Vec<Double, 16> &b)
1962{
1963 return vcombine_f64(vget_low_f64(a) - vget_high_f64(a),
1964 vget_low_f64(b) - vget_high_f64(b));
1965}
1966#endif
1967
1968// -------------------------------------------------------------------------
1969// hsubs
1970// -------------------------------------------------------------------------
1971
1972template <typename T>
1973static SIMD_INLINE Vec<T, 16> hsubs(const Vec<T, 16> &a, const Vec<T, 16> &b)
1974{
1975 Vec<T, 16> x, y;
1976 unzip(a, b, x, y, Bytes<sizeof(T)>());
1977 return subs(x, y);
1978}
1979
1980#ifdef SIMD_64BIT_TYPES
1981// Double not saturated
1982static SIMD_INLINE Vec<Double, 16> hsubs(const Vec<Double, 16> &a,
1983 const Vec<Double, 16> &b)
1984{
1985 return vcombine_f64(vget_low_f64(a) - vget_high_f64(a),
1986 vget_low_f64(b) - vget_high_f64(b));
1987}
1988#endif
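// The generic hsub/hsubs above use the same unzip trick as the generic
// hadds: the de-interleave puts the even lanes of a and b into x and the
// odd lanes into y, turning the horizontal operation into a vertical one.
// A minimal sketch with made-up lanes (four lanes per vector):
//
//   a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
//   unzip  -> x = {a0, a2, b0, b2}, y = {a1, a3, b1, b3}
//   sub(x, y) = {a0-a1, a2-a3, b0-b1, b2-b3} = hsub(a, b)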
1989
1990// -------------------------------------------------------------------------
1991// alignre (moved above srle, slle)
1992// -------------------------------------------------------------------------
1993
1994#define SIMDVEC_NEON_ALIGNRE(TYPE, NEON_SUF) \
1995 template <size_t COUNT> \
1996 static SIMD_INLINE Vec<TYPE, 16> alignre( \
1997 const Vec<TYPE, 16> &, const Vec<TYPE, 16> &l, \
1998 Range<true, 0, Vec<TYPE, 16>::elements>) \
1999 { \
2000 return l; \
2001 } \
2002 template <size_t COUNT> \
2003 static SIMD_INLINE Vec<TYPE, 16> alignre( \
2004 const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &l, \
2005 Range<false, 0, Vec<TYPE, 16>::elements>) \
2006 { \
2007 return vextq_##NEON_SUF(l, h, COUNT); \
2008 } \
2009 template <size_t COUNT> \
2010 static SIMD_INLINE Vec<TYPE, 16> alignre( \
2011 const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &, \
2012 Range<true, Vec<TYPE, 16>::elements, 2 * Vec<TYPE, 16>::elements>) \
2013 { \
2014 return h; \
2015 } \
2016 template <size_t COUNT> \
2017 static SIMD_INLINE Vec<TYPE, 16> alignre( \
2018 const Vec<TYPE, 16> &h, const Vec<TYPE, 16> &, \
2019 Range<false, Vec<TYPE, 16>::elements, 2 * Vec<TYPE, 16>::elements>) \
2020 { \
2021 return vextq_##NEON_SUF(h, vmovq_n_##NEON_SUF(TYPE(0)), \
2022 COUNT - Vec<TYPE, 16>::elements); \
2023 } \
2024 template <size_t COUNT, bool AT_LL, size_t LL_INCL, size_t UL_EXCL> \
2025 static SIMD_INLINE Vec<TYPE, 16> alignre(const Vec<TYPE, 16> &, \
2026 const Vec<TYPE, 16> &, \
2027 Range<AT_LL, LL_INCL, UL_EXCL>) \
2028 { \
2029 return vmovq_n_##NEON_SUF(TYPE(0)); \
2030 }
2031
2032SIMDVEC_NEON_ALIGNRE(Byte, u8)
2033SIMDVEC_NEON_ALIGNRE(SignedByte, s8)
2034SIMDVEC_NEON_ALIGNRE(Word, u16)
2035SIMDVEC_NEON_ALIGNRE(Short, s16)
2036SIMDVEC_NEON_ALIGNRE(Int, s32)
2037SIMDVEC_NEON_ALIGNRE(Float, f32)
2038#ifdef SIMD_64BIT_TYPES
2039SIMDVEC_NEON_ALIGNRE(Long, s64)
2040SIMDVEC_NEON_ALIGNRE(Double, f64)
2041#endif
2042
2043template <size_t COUNT, typename T>
2044static SIMD_INLINE Vec<T, 16> alignre(const Vec<T, 16> &h, const Vec<T, 16> &l)
2045{
2046 return alignre<COUNT>(h, l, SizeRange<COUNT, Vec<T, 16>::elements>());
2047}
2048
2049#undef SIMDVEC_NEON_ALIGNRE
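// alignre<COUNT>(h, l) extracts Vec<T, 16>::elements lanes starting at
// lane COUNT of the concatenation h:l; COUNT == 0 yields l, COUNT ==
// elements yields h, and COUNT >= 2 * elements yields zero. A minimal
// sketch for Int (four lanes, COUNT = 1, made-up values):
//
//   l = {l0, l1, l2, l3}, h = {h0, h1, h2, h3}
//   alignre<1>(h, l) = {l1, l2, l3, h0}    // vextq_s32(l, h, 1)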
2050
2051// -------------------------------------------------------------------------
2052// element-wise shift right
2053// -------------------------------------------------------------------------
2054
2055// all types, done via alignre
2056template <size_t COUNT, typename T>
2057static SIMD_INLINE Vec<T, 16> srle(const Vec<T, 16> &a)
2058{
2059 return alignre<COUNT>(setzero(OutputType<T>(), Integer<16>()), a);
2060}
2061
2062// -------------------------------------------------------------------------
2063// element-wise shift left
2064// -------------------------------------------------------------------------
2065
2066// all types, done via alignre
2067
2068template <size_t COUNT, typename T>
2069static SIMD_INLINE Vec<T, 16> slle(const Vec<T, 16> &a)
2070{
2071 SIMD_IF_CONSTEXPR (COUNT < Vec<T, 16>::elements) {
2072 return alignre<Vec<T, 16>::elements - COUNT>(
2073 a, setzero(OutputType<T>(), Integer<16>()));
2074 } else {
2075 return setzero(OutputType<T>(), Integer<16>());
2076 }
2077}
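// srle shifts whole elements toward lane 0 (zero-filling from the top),
// slle shifts them away from lane 0. A minimal sketch for Int (four
// lanes, COUNT = 1, made-up values):
//
//   a = {a0, a1, a2, a3}
//   srle<1>(a) = {a1, a2, a3, 0}
//   slle<1>(a) = {0, a0, a1, a2}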
2078
2079// -------------------------------------------------------------------------
2080// swizzle
2081// -------------------------------------------------------------------------
2082
2083// swizzle tables
2084
2085static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
2086 Integer<1>)
2087{
2088 const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
2089 {0, 2, 4, 6, 8, 10, 12, 14},
2090 {1, 3, 5, 7, 9, 11, 13, 15},
2091 };
2092 return table[index];
2093}
2094
2095static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
2096 Integer<1>)
2097{
2098 const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
2099 {0, 3, 6, 9, 12, 15, 18, 21},
2100 {1, 4, 7, 10, 13, 16, 19, 22},
2101 {2, 5, 8, 11, 14, 17, 20, 23},
2102 };
2103 return table[index];
2104}
2105
2106static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
2107 Integer<1>)
2108{
2109 const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
2110 {0, 4, 8, 12, 16, 20, 24, 28},
2111 {1, 5, 9, 13, 17, 21, 25, 29},
2112 {2, 6, 10, 14, 18, 22, 26, 30},
2113 {3, 7, 11, 15, 19, 23, 27, 31},
2114 };
2115 return table[index];
2116}
2117
2118static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
2119 Integer<2>)
2120{
2121 const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
2122 {0, 1, 4, 5, 8, 9, 12, 13},
2123 {2, 3, 6, 7, 10, 11, 14, 15},
2124 };
2125 return table[index];
2126}
2127
2128static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
2129 Integer<2>)
2130{
2131 const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
2132 {0, 1, 6, 7, 12, 13, 18, 19},
2133 {2, 3, 8, 9, 14, 15, 20, 21},
2134 {4, 5, 10, 11, 16, 17, 22, 23},
2135 };
2136 return table[index];
2137}
2138
2139static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
2140 Integer<2>)
2141{
2142 const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
2143 {0, 1, 8, 9, 16, 17, 24, 25},
2144 {2, 3, 10, 11, 18, 19, 26, 27},
2145 {4, 5, 12, 13, 20, 21, 28, 29},
2146 {6, 7, 14, 15, 22, 23, 30, 31},
2147 };
2148 return table[index];
2149}
2150
2151static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<2>,
2152 Integer<4>)
2153{
2154 const uint8x8_t table[2] SIMD_ATTR_ALIGNED(16) = {
2155 {0, 1, 2, 3, 8, 9, 10, 11},
2156 {4, 5, 6, 7, 12, 13, 14, 15},
2157 };
2158 return table[index];
2159}
2160
2161static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<3>,
2162 Integer<4>)
2163{
2164 const uint8x8_t table[3] SIMD_ATTR_ALIGNED(16) = {
2165 {0, 1, 2, 3, 12, 13, 14, 15},
2166 {4, 5, 6, 7, 16, 17, 18, 19},
2167 {8, 9, 10, 11, 20, 21, 22, 23},
2168 };
2169 return table[index];
2170}
2171
2172static SIMD_INLINE uint8x8_t swizzleTable(const size_t index, Integer<4>,
2173 Integer<4>)
2174{
2175 const uint8x8_t table[4] SIMD_ATTR_ALIGNED(16) = {
2176 {0, 1, 2, 3, 16, 17, 18, 19},
2177 {4, 5, 6, 7, 20, 21, 22, 23},
2178 {8, 9, 10, 11, 24, 25, 26, 27},
2179 {12, 13, 14, 15, 28, 29, 30, 31},
2180 };
2181 return table[index];
2182}
2183
2184template <size_t N, typename T>
2185static SIMD_INLINE uint8x8_t swizzleTable(const size_t index)
2186{
2187 return swizzleTable(index, Integer<N>(), Integer<sizeof(T)>());
2188}
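// Each table row holds the byte indices that vtbl2/3/4 needs to gather
// one de-interleaved channel: row i collects the bytes of elements
// i, i+N, i+2N, ... (SIZE bytes each). A minimal sketch for N = 2,
// SIZE = 1 with made-up interleaved bytes:
//
//   input = {x0, y0, x1, y1, ...}
//   row 0 = {0, 2, 4, ...} gathers {x0, x1, x2, ...}
//   row 1 = {1, 3, 5, ...} gathers {y0, y1, y2, ...}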
2189
2190template <typename T>
2191static SIMD_INLINE void swizzle(Vec<T, 16>[1], Integer<1>)
2192{
2193 // v remains unchanged
2194}
2195
2196template <typename T>
2197static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
2198{
2199 const Vec<Byte, 16> vByte[2] = {
2200 reinterpret(v[0], OutputType<Byte>()),
2201 reinterpret(v[1], OutputType<Byte>()),
2202 };
2203 for (size_t i = 0; i < 2; i++) {
2204 v[i] =
2205 reinterpret(Vec<Byte, 16>(vcombine_u8(
2206 vtbl2_u8({vget_low_u8(vByte[0]), vget_high_u8(vByte[0])},
2207 swizzleTable<2, T>(i)),
2208 vtbl2_u8({vget_low_u8(vByte[1]), vget_high_u8(vByte[1])},
2209 swizzleTable<2, T>(i)))),
2210 OutputType<T>());
2211 }
2212}
2213
2214#ifdef SIMD_64BIT_TYPES
2215static SIMD_INLINE void swizzle(Vec<Long, 16> v[2], Integer<2>)
2216{
2217 const Vec<Long, 16> tmp[2] = {v[0], v[1]};
2218 v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_low_s64(tmp[1]));
2219 v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_high_s64(tmp[1]));
2220}
2221
2222static SIMD_INLINE void swizzle(Vec<Double, 16> v[2], Integer<2>)
2223{
2224 const Vec<Double, 16> tmp[2] = {v[0], v[1]};
2225 v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_low_f64(tmp[1]));
2226 v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_high_f64(tmp[1]));
2227}
2228#endif
2229
2230template <typename T>
2231static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
2232{
2233 const Vec<Byte, 16> vByte[3] = {
2234 reinterpret(v[0], OutputType<Byte>()),
2235 reinterpret(v[1], OutputType<Byte>()),
2236 reinterpret(v[2], OutputType<Byte>()),
2237 };
2238 const uint8x8x3_t vu[2] = {
2239 {vget_low_u8(vByte[0]), vget_high_u8(vByte[0]), vget_low_u8(vByte[1])},
2240 {vget_high_u8(vByte[1]), vget_low_u8(vByte[2]), vget_high_u8(vByte[2])},
2241 };
2242 for (size_t i = 0; i < 3; i++) {
2243 v[i] = reinterpret(
2244 Vec<Byte, 16>(vcombine_u8(vtbl3_u8(vu[0], swizzleTable<3, T>(i)),
2245 vtbl3_u8(vu[1], swizzleTable<3, T>(i)))),
2246 OutputType<T>());
2247 }
2248}
2249
2250#ifdef SIMD_64BIT_TYPES
2251static SIMD_INLINE void swizzle(Vec<Long, 16> v[3], Integer<3>)
2252{
2253 const Vec<Long, 16> tmp[3] = {v[0], v[1], v[2]};
2254 v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_high_s64(tmp[1]));
2255 v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_low_s64(tmp[2]));
2256 v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_high_s64(tmp[2]));
2257}
2258
2259static SIMD_INLINE void swizzle(Vec<Double, 16> v[3], Integer<3>)
2260{
2261 const Vec<Double, 16> tmp[3] = {v[0], v[1], v[2]};
2262 v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_high_f64(tmp[1]));
2263 v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_low_f64(tmp[2]));
2264 v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_high_f64(tmp[2]));
2265}
2266#endif
2267
2268template <typename T>
2269static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
2270{
2271 const Vec<Byte, 16> vByte[4] = {
2272 reinterpret(v[0], OutputType<Byte>()),
2273 reinterpret(v[1], OutputType<Byte>()),
2274 reinterpret(v[2], OutputType<Byte>()),
2275 reinterpret(v[3], OutputType<Byte>()),
2276 };
2277 const uint8x8x4_t vu[2] = {
2278 {vget_low_u8(vByte[0]), vget_high_u8(vByte[0]), vget_low_u8(vByte[1]),
2279 vget_high_u8(vByte[1])},
2280 {vget_low_u8(vByte[2]), vget_high_u8(vByte[2]), vget_low_u8(vByte[3]),
2281 vget_high_u8(vByte[3])},
2282 };
2283 for (size_t i = 0; i < 4; i++) {
2284 v[i] = reinterpret(
2285 Vec<Byte, 16>(vcombine_u8(vtbl4_u8(vu[0], swizzleTable<4, T>(i)),
2286 vtbl4_u8(vu[1], swizzleTable<4, T>(i)))),
2287 OutputType<T>());
2288 }
2289}
2290
2291#ifdef SIMD_64BIT_TYPES
2292static SIMD_INLINE void swizzle(Vec<Long, 16> v[4], Integer<4>)
2293{
2294 const Vec<Long, 16> tmp[4] = {v[0], v[1], v[2], v[3]};
2295 v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_low_s64(tmp[2]));
2296 v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_high_s64(tmp[2]));
2297 v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_low_s64(tmp[3]));
2298 v[3] = vcombine_s64(vget_high_s64(tmp[1]), vget_high_s64(tmp[3]));
2299}
2300
2301static SIMD_INLINE void swizzle(Vec<Double, 16> v[4], Integer<4>)
2302{
2303 const Vec<Double, 16> tmp[4] = {v[0], v[1], v[2], v[3]};
2304 v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_low_f64(tmp[2]));
2305 v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_high_f64(tmp[2]));
2306 v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_low_f64(tmp[3]));
2307 v[3] = vcombine_f64(vget_high_f64(tmp[1]), vget_high_f64(tmp[3]));
2308}
2309#endif
2310
2311// ---------- n = 5 ----------
2312
2313// swizzle table
2314
2315// arrays are padded from 24 to 32 elements to keep alignment
2316static const uint8_t swizzleMask5Lo[5][32] SIMD_ATTR_ALIGNED(16) = {
2317 {},
2318 {0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17,
2319 3, 8, 13, 18, 4, 9, 14, 19, 99, 99, 99, 99},
2320 {0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15,
2321 6, 7, 16, 17, 8, 9, 18, 19, 99, 99, 99, 99},
2322 {},
2323 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2324 12, 13, 14, 15, 16, 17, 18, 19, 99, 99, 99, 99},
2325};
2326
2327// arrays are padded from 24 to 32 elements to keep alignment
2328static const uint8_t swizzleMask5Hi[5][32] SIMD_ATTR_ALIGNED(16) = {
2329 {},
2330 {4, 9, 14, 19, 5, 10, 15, 20, 6, 11, 16, 21,
2331 7, 12, 17, 22, 8, 13, 18, 23, 99, 99, 99, 99},
2332 {4, 5, 14, 15, 6, 7, 16, 17, 8, 9, 18, 19,
2333 10, 11, 20, 21, 12, 13, 22, 23, 99, 99, 99, 99},
2334 {},
2335 {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2336 16, 17, 18, 19, 20, 21, 22, 23, 99, 99, 99, 99},
2337};
2338
2339// n = 5
2340template <size_t SIZE>
2341struct SwizzleTable5
2342{
2343 // two tables for n=3
2344 uint8x8x3_t table[2];
2345 SwizzleTable5()
2346 {
2347 for (size_t i = 0; i < 3; i++) {
2348 // first half (applied to vectors 0,1,2)
2349 table[0].val[i] = vld1_u8(&swizzleMask5Lo[SIZE][i * 8]);
2350 // second half (applied to vectors 2,3,4)
2351 table[1].val[i] = vld1_u8(&swizzleMask5Hi[SIZE][i * 8]);
2352 }
2353 }
2354};
2355
2356// n = 5
2357template <typename T>
2358static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
2359{
2360 // | v0l v0h | v1l v1h | v2l v2h | v3l v3h | v4l v4h |
2361 // i=0:
2362 // k: 0 1 2
2363 // j: 0 1 2
2364  // | vu0.0  vu0.1  vu0.2|
2365 // i=1:
2366 // k: 2 3 4
2367 // j: 0 1 2
2368 // |vu1.0 vu1.1 vu1.2|
2369 // i=2:
2370 // k: 5 6 7
2371 // j: 0 1 2
2372  // |vu2.0  vu2.1  vu2.2|
2373 // i=3:
2374 // k: 7 8 9
2375 // j: 0 1 2
2376 // |vu3.0 vu3.1 vu3.2 |
2377 //
2378 // n=0: n=1:
2379 // i=0: i=1: i=0: i=1:
2380 // k=0: k=1: k=2: k=3:
2381 // j=0:
2382 // | t.table[0].val[0]| | t.table[0].val[0]|
2383 // | t.table[1].val[0] | | t.table[1].val[0] |
2384 // j=1:
2385 // | t.table[0].val[1]| | t.table[0].val[1]|
2386 // | t.table[1].val[1] | | t.table[1].val[1] |
2387 // j=2:
2388 // | t.table[0].val[2]| | t.table[0].val[2]|
2389 // | t.table[1].val[2] | | t.table[1].val[2] |
2390
2391 uint8x8x3_t vu[4];
2392 // input half-vector index starts at k0
2393 const size_t k0[4] = {0, 2, 5, 7};
2394 for (size_t i = 0; i < 4; i++) {
2395 for (size_t j = 0; j < 3; j++) {
2396 const size_t k = k0[i] + j;
2397 const Vec<Byte, 16> vb = reinterpret(v[k >> 1], OutputType<Byte>());
2398 vu[i].val[j] = (k & 1) ? vget_high_u8(vb) : vget_low_u8(vb);
2399 }
2400 }
2401 static const SwizzleTable5<sizeof(T)> t;
2402 uint8x8_t r[2][3][3];
2403 // n: left/right half of input
2404 // k: index of vu
2405 for (size_t n = 0, k = 0; n < 2; n++)
2406 // i: left/right half of half input
2407 for (size_t i = 0; i < 2; i++, k++)
2408 // j: different 3-tables
2409 for (size_t j = 0; j < 3; j++)
2410 // apply table
2411 r[n][i][j] = vtbl3_u8(vu[k], t.table[i].val[j]);
2412 // zip 4-byte blocks together
2413 int32x2x2_t z[2][3];
2414 for (size_t n = 0; n < 2; n++)
2415 for (size_t j = 0; j < 3; j++)
2416 z[n][j] = vzip_s32(vreinterpret_s32_u8(r[n][0][j]),
2417 vreinterpret_s32_u8(r[n][1][j]));
2418  // combine left and right halves
2419 for (size_t j = 0, k = 0; j < 3; j++) {
2420 for (size_t lh = 0; lh < 2; lh++) {
2421 v[k] = reinterpret(
2422 Vec<Int, 16>(vcombine_s32(z[0][j].val[lh], z[1][j].val[lh])),
2423 OutputType<T>());
2424 k++;
2425 if (k >= 5) break;
2426 }
2427 }
2428}
2429
2430#ifdef SIMD_64BIT_TYPES
2431static SIMD_INLINE void swizzle(Vec<Long, 16> v[5], Integer<5>)
2432{
2433 const Vec<Long, 16> tmp[5] = {v[0], v[1], v[2], v[3], v[4]};
2434 v[0] = vcombine_s64(vget_low_s64(tmp[0]), vget_high_s64(tmp[2]));
2435 v[1] = vcombine_s64(vget_high_s64(tmp[0]), vget_low_s64(tmp[3]));
2436 v[2] = vcombine_s64(vget_low_s64(tmp[1]), vget_high_s64(tmp[3]));
2437 v[3] = vcombine_s64(vget_high_s64(tmp[1]), vget_low_s64(tmp[4]));
2438 v[4] = vcombine_s64(vget_low_s64(tmp[2]), vget_high_s64(tmp[4]));
2439}
2440
2441static SIMD_INLINE void swizzle(Vec<Double, 16> v[5], Integer<5>)
2442{
2443 const Vec<Double, 16> tmp[5] = {v[0], v[1], v[2], v[3], v[4]};
2444 v[0] = vcombine_f64(vget_low_f64(tmp[0]), vget_high_f64(tmp[2]));
2445 v[1] = vcombine_f64(vget_high_f64(tmp[0]), vget_low_f64(tmp[3]));
2446 v[2] = vcombine_f64(vget_low_f64(tmp[1]), vget_high_f64(tmp[3]));
2447 v[3] = vcombine_f64(vget_high_f64(tmp[1]), vget_low_f64(tmp[4]));
2448 v[4] = vcombine_f64(vget_low_f64(tmp[2]), vget_high_f64(tmp[4]));
2449}
2450#endif
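// For every n, the net effect of swizzle is a de-interleave: afterwards
// v[j] holds every n-th element starting at offset j. A sketch for n = 5
// and Int (made-up elements e0, e1, ...):
//
//   before: v[0..4] = {e0..e3}, {e4..e7}, ..., {e16..e19}
//   after:  v[0] = {e0, e5, e10, e15}, v[1] = {e1, e6, e11, e16}, ...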
2451
2452// -------------------------------------------------------------------------
2453// compare functions
2454// -------------------------------------------------------------------------
2455
2456#define SIMDVEC_NEON_CMP(CMP, TYPE, NEON_SUF, NEON_USUF) \
2457 static SIMD_INLINE Vec<TYPE, 16> cmp##CMP(const Vec<TYPE, 16> &a, \
2458 const Vec<TYPE, 16> &b) \
2459 { \
2460 return vreinterpretq_##NEON_SUF##_##NEON_USUF( \
2461 vc##CMP##q##_##NEON_SUF(a, b)); \
2462 }
2463
2464#ifdef SIMD_64BIT_TYPES
2465#define SIMDVEC_NEON_CMP_ALL(CMP) \
2466 SIMDVEC_NEON_CMP(CMP, Byte, u8, u8) \
2467 SIMDVEC_NEON_CMP(CMP, SignedByte, s8, u8) \
2468 SIMDVEC_NEON_CMP(CMP, Word, u16, u16) \
2469 SIMDVEC_NEON_CMP(CMP, Short, s16, u16) \
2470 SIMDVEC_NEON_CMP(CMP, Int, s32, u32) \
2471 SIMDVEC_NEON_CMP(CMP, Long, s64, u64) \
2472 SIMDVEC_NEON_CMP(CMP, Float, f32, u32) \
2473 SIMDVEC_NEON_CMP(CMP, Double, f64, u64)
2474#else
2475#define SIMDVEC_NEON_CMP_ALL(CMP) \
2476 SIMDVEC_NEON_CMP(CMP, Byte, u8, u8) \
2477 SIMDVEC_NEON_CMP(CMP, SignedByte, s8, u8) \
2478 SIMDVEC_NEON_CMP(CMP, Word, u16, u16) \
2479 SIMDVEC_NEON_CMP(CMP, Short, s16, u16) \
2480 SIMDVEC_NEON_CMP(CMP, Int, s32, u32) \
2481 SIMDVEC_NEON_CMP(CMP, Float, f32, u32)
2482#endif
2483
2484SIMDVEC_NEON_CMP_ALL(lt)
2485SIMDVEC_NEON_CMP_ALL(le)
2486SIMDVEC_NEON_CMP_ALL(eq)
2487SIMDVEC_NEON_CMP_ALL(gt)
2488SIMDVEC_NEON_CMP_ALL(ge)
2489
2490#undef SIMDVEC_NEON_CMP_ALL
2491#undef SIMDVEC_NEON_CMP
2492
2493// -------------------------------------------------------------------------
2494// compare !=
2495// -------------------------------------------------------------------------
2496
2497#define SIMDVEC_NEON_CMPNEQ(TYPE, NEON_SUF, NEON_USUF) \
2498 static SIMD_INLINE Vec<TYPE, 16> cmpneq(const Vec<TYPE, 16> &a, \
2499 const Vec<TYPE, 16> &b) \
2500 { \
2501 return vreinterpretq_##NEON_SUF##_u32( \
2502 vmvnq_u32(vreinterpretq_u32_##NEON_USUF(vceqq_##NEON_SUF(a, b)))); \
2503 }
2504
2505SIMDVEC_NEON_CMPNEQ(Byte, u8, u8)
2506SIMDVEC_NEON_CMPNEQ(SignedByte, s8, u8)
2507SIMDVEC_NEON_CMPNEQ(Word, u16, u16)
2508SIMDVEC_NEON_CMPNEQ(Short, s16, u16)
2509SIMDVEC_NEON_CMPNEQ(Int, s32, u32)
2510SIMDVEC_NEON_CMPNEQ(Float, f32, u32)
2511#ifdef SIMD_64BIT_TYPES
2512SIMDVEC_NEON_CMPNEQ(Long, s64, u64)
2513SIMDVEC_NEON_CMPNEQ(Double, f64, u64)
2514#endif
2515
2516#undef SIMDVEC_NEON_CMPNEQ
2517
2518// -------------------------------------------------------------------------
2519// ifelse
2520// -------------------------------------------------------------------------
2521
2522// vbslq, unsigned mask
2523#define SIMDVEC_NEON_IFELSE(T, NEON_SUF, NEON_USUF) \
2524 static SIMD_INLINE Vec<T, 16> ifelse(const Vec<T, 16> &cond, \
2525 const Vec<T, 16> &trueVal, \
2526 const Vec<T, 16> &falseVal) \
2527 { \
2528 return vbslq_##NEON_SUF(vreinterpretq_##NEON_USUF##_##NEON_SUF(cond), \
2529 trueVal, falseVal); \
2530 }
2531
2532SIMDVEC_NEON_IFELSE(Byte, u8, u8)
2533SIMDVEC_NEON_IFELSE(SignedByte, s8, u8)
2534SIMDVEC_NEON_IFELSE(Word, u16, u16)
2535SIMDVEC_NEON_IFELSE(Short, s16, u16)
2536SIMDVEC_NEON_IFELSE(Int, s32, u32)
2537SIMDVEC_NEON_IFELSE(Float, f32, u32)
2538#ifdef SIMD_64BIT_TYPES
2539SIMDVEC_NEON_IFELSE(Long, s64, u64)
2540SIMDVEC_NEON_IFELSE(Double, f64, u64)
2541#endif
2542
2543#undef SIMDVEC_NEON_IFELSE
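// The comparisons above return all-ones/all-zeros lane masks, which is
// exactly the selector vbslq expects. A minimal sketch combining both
// (a and b are hypothetical inputs):
//
//   Vec<Float, 16> mask = cmpgt(a, b);        // per lane: a > b ? ~0 : 0
//   Vec<Float, 16> mx   = ifelse(mask, a, b); // per-lane max(a, b)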
2544
2545// -------------------------------------------------------------------------
2546// bit_and
2547// -------------------------------------------------------------------------
2548
2549SIMDVEC_NEON_BINARY_ALLINT(bit_and, vandq)
2550
2551static SIMD_INLINE Vec<Float, 16> bit_and(const Vec<Float, 16> &a,
2552 const Vec<Float, 16> &b)
2553{
2554 return vreinterpretq_f32_s32(
2555 vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
2556}
2557
2558#ifdef SIMD_64BIT_TYPES
2559static SIMD_INLINE Vec<Double, 16> bit_and(const Vec<Double, 16> &a,
2560 const Vec<Double, 16> &b)
2561{
2562 return vreinterpretq_f64_s64(
2563 vandq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
2564}
2565#endif
2566
2567// -------------------------------------------------------------------------
2568// bit_or
2569// -------------------------------------------------------------------------
2570
2571SIMDVEC_NEON_BINARY_ALLINT(bit_or, vorrq)
2572
2573static SIMD_INLINE Vec<Float, 16> bit_or(const Vec<Float, 16> &a,
2574 const Vec<Float, 16> &b)
2575{
2576 return vreinterpretq_f32_s32(
2577 vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
2578}
2579
2580#ifdef SIMD_64BIT_TYPES
2581static SIMD_INLINE Vec<Double, 16> bit_or(const Vec<Double, 16> &a,
2582 const Vec<Double, 16> &b)
2583{
2584 return vreinterpretq_f64_s64(
2585 vorrq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
2586}
2587#endif
2588
2589// -------------------------------------------------------------------------
2590// bit_andnot
2591// -------------------------------------------------------------------------
2592
2593template <typename T>
2594static SIMD_INLINE Vec<T, 16> bit_andnot(const Vec<T, 16> &a,
2595 const Vec<T, 16> &b)
2596{
2597 return bit_and(bit_not(a), b);
2598}
2599
2600// -------------------------------------------------------------------------
2601// bit_xor
2602// -------------------------------------------------------------------------
2603
2604SIMDVEC_NEON_BINARY_ALLINT(bit_xor, veorq)
2605
2606static SIMD_INLINE Vec<Float, 16> bit_xor(const Vec<Float, 16> &a,
2607 const Vec<Float, 16> &b)
2608{
2609 return vreinterpretq_f32_s32(
2610 veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)));
2611}
2612
2613#ifdef SIMD_64BIT_TYPES
2614static SIMD_INLINE Vec<Double, 16> bit_xor(const Vec<Double, 16> &a,
2615 const Vec<Double, 16> &b)
2616{
2617 return vreinterpretq_f64_s64(
2618 veorq_s64(vreinterpretq_s64_f64(a), vreinterpretq_s64_f64(b)));
2619}
2620#endif
2621
2622// -------------------------------------------------------------------------
2623// bit_not
2624// -------------------------------------------------------------------------
2625
2626SIMDVEC_NEON_UNARY(bit_not, Byte, vmvnq, u8)
2627SIMDVEC_NEON_UNARY(bit_not, SignedByte, vmvnq, s8)
2628SIMDVEC_NEON_UNARY(bit_not, Word, vmvnq, u16)
2629SIMDVEC_NEON_UNARY(bit_not, Short, vmvnq, s16)
2630SIMDVEC_NEON_UNARY(bit_not, Int, vmvnq, s32)
2631
2632static SIMD_INLINE Vec<Float, 16> bit_not(const Vec<Float, 16> &a)
2633{
2634 return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a)));
2635}
2636
2637#ifdef SIMD_64BIT_TYPES
2638static SIMD_INLINE Vec<Long, 16> bit_not(const Vec<Long, 16> &a)
2639{
2640 return vreinterpretq_s64_u32(vmvnq_u32(vreinterpretq_u32_s64(a)));
2641}
2642static SIMD_INLINE Vec<Double, 16> bit_not(const Vec<Double, 16> &a)
2643{
2644 return vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a)));
2645}
2646#endif
2647
2648// -------------------------------------------------------------------------
2649// avg: average with rounding up
2650// -------------------------------------------------------------------------
2651
2652SIMDVEC_NEON_BINARY(avg, Byte, vrhaddq, u8)
2653SIMDVEC_NEON_BINARY(avg, SignedByte, vrhaddq, s8)
2654SIMDVEC_NEON_BINARY(avg, Word, vrhaddq, u16)
2655SIMDVEC_NEON_BINARY(avg, Short, vrhaddq, s16)
2656SIMDVEC_NEON_BINARY(avg, Int, vrhaddq, s32)
2657
2658static SIMD_INLINE Vec<Float, 16> avg(const Vec<Float, 16> &a,
2659 const Vec<Float, 16> &b)
2660{
2661 return vmulq_n_f32(vaddq_f32(a, b), 0.5f);
2662}
2663
2664#ifdef SIMD_64BIT_TYPES
2665static SIMD_INLINE Vec<Long, 16> avg(const Vec<Long, 16> &a,
2666 const Vec<Long, 16> &b)
2667{
2668 // vrhaddq_s64 does not exist
2669 // workaround from Hacker's Delight, 2-5 Average of Two Integers:
2670 // (a | b) - ((a ^ b) >> 1)
2671 return vsubq_s64(vorrq_s64(a, b), vshrq_n_s64(veorq_s64(a, b), 1));
2672}
2673static SIMD_INLINE Vec<Double, 16> avg(const Vec<Double, 16> &a,
2674 const Vec<Double, 16> &b)
2675{
2676 return vmulq_n_f64(vaddq_f64(a, b), 0.5);
2677}
2678#endif
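// A worked example of the Hacker's Delight identity used for Long (values
// are arbitrary): for a = 5, b = 8 we get a | b = 13, a ^ b = 13,
// (a ^ b) >> 1 = 6, and 13 - 6 = 7 = ceil((5 + 8) / 2), i.e. the average
// rounded up, without forming the intermediate sum that could overflow
// 64 bit.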
2679
2680// -------------------------------------------------------------------------
2681// test_all_zeros
2682// -------------------------------------------------------------------------
2683
2684// from solution suggested by Henri Ylitie
2685// http://stackoverflow.com/questions/15389539/
2686// fastest-way-to-test-a-128-bit-neon-register-
2687// for-a-value-of-0-using-intrinsics
2688
2689static SIMD_INLINE float32x2_t vorr_f32(float32x2_t a, float32x2_t b)
2690{
2691 return vreinterpret_f32_s32(
2692 vorr_s32(vreinterpret_s32_f32(a), vreinterpret_s32_f32(b)));
2693}
2694
2695 // vpmax has to operate on unsigned (u32); otherwise 0 could be the
2696 // maximum of a pair even though the other value is non-zero (negative)
2697#define SIMDVEC_NEON_TESTALLZEROS(T, NEON_SUF) \
2698 static SIMD_INLINE bool test_all_zeros(const Vec<T, 16> &a) \
2699 { \
2700 uint32x4_t au = vreinterpretq_u32_##NEON_SUF(a); \
2701 uint32x2_t tmp = vorr_u32(vget_low_u32(au), vget_high_u32(au)); \
2702 return !vget_lane_u32(vpmax_u32(tmp, tmp), 0); \
2703 }
2704
2705SIMDVEC_NEON_TESTALLZEROS(Byte, u8)
2706SIMDVEC_NEON_TESTALLZEROS(SignedByte, s8)
2707SIMDVEC_NEON_TESTALLZEROS(Word, u16)
2708SIMDVEC_NEON_TESTALLZEROS(Short, s16)
2709SIMDVEC_NEON_TESTALLZEROS(Int, s32)
2710SIMDVEC_NEON_TESTALLZEROS(Float, f32)
2711#ifdef SIMD_64BIT_TYPES
2712SIMDVEC_NEON_TESTALLZEROS(Long, s64)
2713SIMDVEC_NEON_TESTALLZEROS(Double, f64)
2714#endif
2715
2716#undef SIMDVEC_NEON_TESTALLZEROS
2717
2718// -------------------------------------------------------------------------
2719// test_all_ones
2720// -------------------------------------------------------------------------
2721
2722template <typename T>
2723static SIMD_INLINE bool test_all_ones(const Vec<T, 16> &a)
2724{
2725 return test_all_zeros(bit_not(a));
2726}
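// Minimal usage sketch (a and b are hypothetical inputs): combined with
// the compare functions, these yield whole-vector predicates.
//
//   if (test_all_zeros(cmpneq(a, b))) { /* a equals b in every lane */ }
//   if (test_all_ones(cmpge(a, b)))   { /* a >= b in every lane */ }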
2727
2728// -------------------------------------------------------------------------
2729// reverse
2730// -------------------------------------------------------------------------
2731
2732// https://stackoverflow.com/questions/18760784/reverse-vector-order-in-arm-neon-intrinsics
2733
2734#define SIMDVEC_NEON_REVERSE(T, NEON_SUF) \
2735 static SIMD_INLINE Vec<T, 16> reverse(const Vec<T, 16> &a) \
2736 { \
2737 const auto t = vrev64q_##NEON_SUF(a); \
2738 return vcombine_##NEON_SUF(vget_high_##NEON_SUF(t), \
2739 vget_low_##NEON_SUF(t)); \
2740 }
2741
2742SIMDVEC_NEON_REVERSE(Byte, u8)
2743SIMDVEC_NEON_REVERSE(SignedByte, s8)
2744SIMDVEC_NEON_REVERSE(Word, u16)
2745SIMDVEC_NEON_REVERSE(Short, s16)
2746SIMDVEC_NEON_REVERSE(Int, s32)
2747SIMDVEC_NEON_REVERSE(Float, f32)
2748#ifdef SIMD_64BIT_TYPES
2749static SIMD_INLINE Vec<Long, 16> reverse(const Vec<Long, 16> &a)
2750{
2751 return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
2752}
2753static SIMD_INLINE Vec<Double, 16> reverse(const Vec<Double, 16> &a)
2754{
2755 return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
2756}
2757#endif
2758
2759#undef SIMDVEC_NEON_REVERSE
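// vrev64q only reverses the elements within each 64-bit half, so swapping
// the two halves with vcombine completes the reversal. A sketch for Int:
//
//   a = {a0, a1, a2, a3}
//   vrev64q_s32(a) = {a1, a0, a3, a2}
//   reverse(a)     = {a3, a2, a1, a0}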
2760
2761// ---------------------------------------------------------------------------
2762// msb2int
2763// ---------------------------------------------------------------------------
2764
2765// 17. Sep 22 (Jonas Keller): added msb2int functions
2766
2767static SIMD_INLINE uint64_t msb2int(const Vec<Byte, 16> &a)
2768{
2769 // from: https://stackoverflow.com/a/58381188/8461272
2770
2771 // Example input (half scale):
2772 // 0x89 FF 1D C0 00 10 99 33
2773
2774 // Shift out everything but the sign bits
2775 // 0x01 01 00 01 00 00 01 00
2776 uint8x16_t high_bits = vshrq_n_u8(a, 7);
2777
2778 // Merge the even lanes together with vsra. The '??' bytes are garbage.
2779 // vsri could also be used, but it is slightly slower on aarch64.
2780 // 0x??03 ??02 ??00 ??01
2781 uint16x8_t paired16 = vsraq_n_u16(vreinterpretq_u16_u8(high_bits),
2782 vreinterpretq_u16_u8(high_bits), 7);
2783 // Repeat with wider lanes.
2784 // 0x??????0B ??????04
2785 uint32x4_t paired32 = vsraq_n_u32(vreinterpretq_u32_u16(paired16),
2786 vreinterpretq_u32_u16(paired16), 14);
2787 // 0x??????????????4B
2788 uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(paired32),
2789 vreinterpretq_u64_u32(paired32), 28);
2790 // Extract the low 8 bits from each lane and join.
2791 // 0x4B
2792 return vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) |
2793 ((int) vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) << 8);
2794}
2795
2796static SIMD_INLINE uint64_t msb2int(const Vec<SignedByte, 16> &a)
2797{
2798 // the same as msb2int(Vec<Byte,16>)
2799 return msb2int(reinterpret(a, OutputType<Byte>()));
2800}
2801
2802static SIMD_INLINE uint64_t msb2int(const Vec<Word, 16> &a)
2803{
2804 // analogous to msb2int(Vec<Byte,16>)
2805 // idea from: https://stackoverflow.com/a/58381188/8461272
2806
2807 // Shift out everything but the sign bits
2808 uint16x8_t high_bits = vshrq_n_u16(a, 15);
2809
2810 // Merge the even lanes together with vsra. The '??' bytes are garbage.
2811 uint32x4_t paired32 = vsraq_n_u32(vreinterpretq_u32_u16(high_bits),
2812 vreinterpretq_u32_u16(high_bits), 15);
2813 // Repeat with wider lanes.
2814 uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(paired32),
2815 vreinterpretq_u64_u32(paired32), 30);
2816 // Extract the low 4 bits from each lane and join.
2817 return (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) & 0xf) |
2818 (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) << 4);
2819}
2820
2821static SIMD_INLINE uint64_t msb2int(const Vec<Short, 16> &a)
2822{
2823 // the same as msb2int(Vec<Word,16>)
2824 return msb2int(reinterpret(a, OutputType<Word>()));
2825}
2826
2827static SIMD_INLINE uint64_t msb2int(const Vec<Int, 16> &a)
2828{
2829 // analogous to msb2int(Vec<Byte,16>)
2830 // idea from: https://stackoverflow.com/a/58381188/8461272
2831
2832 // Shift out everything but the sign bits
2833 uint32x4_t high_bits = vshrq_n_u32(vreinterpretq_u32_s32(a), 31);
2834
2835 // Merge the even lanes together with vsra. The '??' bytes are garbage.
2836 uint64x2_t paired64 = vsraq_n_u64(vreinterpretq_u64_u32(high_bits),
2837 vreinterpretq_u64_u32(high_bits), 31);
2838 // Extract the low 2 bits from each lane and join.
2839 return (vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 0) & 0x3) |
2840 ((vgetq_lane_u8(vreinterpretq_u8_u64(paired64), 8) & 0x3) << 2);
2841}
2842
2843static SIMD_INLINE uint64_t msb2int(const Vec<Float, 16> &a)
2844{
2845 // the same as msb2int(Vec<Int,16>)
2846 return msb2int(reinterpret(a, OutputType<Int>()));
2847}
2848
2849#ifdef SIMD_64BIT_TYPES
2850static SIMD_INLINE uint64_t msb2int(const Vec<Long, 16> &a)
2851{
2852 // shift out everything but the sign bits
2853 uint64x2_t high_bits = vshrq_n_u64(vreinterpretq_u64_s64(a), 63);
2854 // extract the low bit from each lane and join
2855 return vgetq_lane_u8(vreinterpretq_u8_u64(high_bits), 0) |
2856 (vgetq_lane_u8(vreinterpretq_u8_u64(high_bits), 8) << 1);
2857}
2858static SIMD_INLINE uint64_t msb2int(const Vec<Double, 16> &a)
2859{
2860 // the same as msb2int(Vec<Long,16>)
2861 return msb2int(reinterpret(a, OutputType<Long>()));
2862}
2863#endif
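// msb2int collects one bit per lane (the lane's msb), the NEON counterpart
// of the SSE movemask instructions. A minimal sketch (a and b are
// hypothetical inputs):
//
//   Vec<Float, 16> m = cmplt(a, b);  // per-lane all-ones/all-zeros mask
//   uint64_t bits = msb2int(m);      // bit i set <=> a < b in lane i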
2864
2865// ---------------------------------------------------------------------------
2866// int2msb
2867// ---------------------------------------------------------------------------
2868
2869// 06. Oct 22 (Jonas Keller): added int2msb functions
2870
2871static SIMD_INLINE Vec<Byte, 16> int2msb(const uint64_t a, OutputType<Byte>,
2872 Integer<16>)
2873{
2874 uint8x8_t aVecLo = vdup_n_u8(a & 0xff);
2875 uint8x8_t aVecHi = vdup_n_u8((a >> 8) & 0xff);
2876 uint8x16_t aVec = vcombine_u8(aVecLo, aVecHi);
2877 // shift the bits to the msb
2878 int8x16_t shiftAmounts = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
2879 uint8x16_t shifted = vshlq_u8(aVec, shiftAmounts);
2880 return vandq_u8(shifted, vdupq_n_u8(0x80));
2881}
2882
2883static SIMD_INLINE Vec<SignedByte, 16> int2msb(const uint64_t a,
2884 OutputType<SignedByte>,
2885 Integer<16>)
2886{
2887 return reinterpret(int2msb(a, OutputType<Byte>(), Integer<16>()),
2888 OutputType<SignedByte>());
2889}
2890
2891static SIMD_INLINE Vec<Word, 16> int2msb(const uint64_t a, OutputType<Word>,
2892 Integer<16>)
2893{
2894 uint16x8_t aVec = vdupq_n_u16(a & 0xff);
2895 // shift the bits to the msb
2896 int16x8_t shiftAmounts = {15, 14, 13, 12, 11, 10, 9, 8};
2897 uint16x8_t shifted = vshlq_u16(aVec, shiftAmounts);
2898 return vandq_u16(shifted, vdupq_n_u16(0x8000));
2899}
2900
2901static SIMD_INLINE Vec<Short, 16> int2msb(const uint64_t a, OutputType<Short>,
2902 Integer<16>)
2903{
2904 return reinterpret(int2msb(a, OutputType<Word>(), Integer<16>()),
2905 OutputType<Short>());
2906}
2907
2908static SIMD_INLINE Vec<Int, 16> int2msb(const uint64_t a, OutputType<Int>,
2909 Integer<16>)
2910{
2911 int32x4_t aVec = vdupq_n_s32(a & 0xf);
2912 // shift the bits to the msb
2913 int32x4_t shiftAmounts = {31, 30, 29, 28};
2914 int32x4_t shifted = vshlq_s32(aVec, shiftAmounts);
2915 return vandq_s32(shifted, vdupq_n_s32(0x80000000));
2916}
2917
2918static SIMD_INLINE Vec<Float, 16> int2msb(const uint64_t a, OutputType<Float>,
2919 Integer<16>)
2920{
2921 return reinterpret(int2msb(a, OutputType<Int>(), Integer<16>()),
2922 OutputType<Float>());
2923}
2924
2925#ifdef SIMD_64BIT_TYPES
2926static SIMD_INLINE Vec<Long, 16> int2msb(const uint64_t a, OutputType<Long>,
2927 Integer<16>)
2928{
2929 int64x2_t aVec = vdupq_n_s64(a & 0x3);
2930 // shift the bits to the msb
2931 int64x2_t shiftAmounts = {63, 62};
2932 int64x2_t shifted = vshlq_s64(aVec, shiftAmounts);
2933 int64x2_t result = vandq_s64(shifted, vdupq_n_s64(0x8000000000000000));
2934 return result;
2935}
2936static SIMD_INLINE Vec<Double, 16> int2msb(const uint64_t a, OutputType<Double>,
2937 Integer<16>)
2938{
2939 return reinterpret(int2msb(a, OutputType<Long>(), Integer<16>()),
2940 OutputType<Double>());
2941}
2942#endif
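// int2msb inverts msb2int restricted to the sign bit: bit i of the input
// becomes the msb of lane i, all other lane bits are zero. A minimal
// sketch (the input value is arbitrary):
//
//   auto v = int2msb(0x5, OutputType<Int>(), Integer<16>());
//   // lanes: {0x80000000, 0, 0x80000000, 0}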
2943
2944// ---------------------------------------------------------------------------
2945// int2bits
2946// ---------------------------------------------------------------------------
2947
2948// 09. Oct 22 (Jonas Keller): added int2bits functions
2949
2950static SIMD_INLINE Vec<Byte, 16> int2bits(const uint64_t a, OutputType<Byte>,
2951 Integer<16>)
2952{
2953 uint8x8_t aVecLo = vdup_n_u8(a & 0xff);
2954 uint8x8_t aVecHi = vdup_n_u8((a >> 8) & 0xff);
2955 uint8x16_t aVec = vcombine_u8(aVecLo, aVecHi);
2956 uint8x16_t sel = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2957 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
2958 return vtstq_u8(aVec, sel);
2959}
2960
2961static SIMD_INLINE Vec<SignedByte, 16> int2bits(const uint64_t a,
2962 OutputType<SignedByte>,
2963 Integer<16>)
2964{
2965 return reinterpret(int2bits(a, OutputType<Byte>(), Integer<16>()),
2966 OutputType<SignedByte>());
2967}
2968
2969static SIMD_INLINE Vec<Word, 16> int2bits(const uint64_t a, OutputType<Word>,
2970 Integer<16>)
2971{
2972 uint16x8_t aVec = vdupq_n_u16(a & 0xff);
2973 uint16x8_t sel = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
2974 return vtstq_u16(aVec, sel);
2975}
2976
2977static SIMD_INLINE Vec<Short, 16> int2bits(const uint64_t a, OutputType<Short>,
2978 Integer<16>)
2979{
2980 return reinterpret(int2bits(a, OutputType<Word>(), Integer<16>()),
2981 OutputType<Short>());
2982}
2983
2984static SIMD_INLINE Vec<Int, 16> int2bits(const uint64_t a, OutputType<Int>,
2985 Integer<16>)
2986{
2987 int32x4_t aVec = vdupq_n_s32(a & 0xf);
2988 int32x4_t sel = {0x01, 0x02, 0x04, 0x08};
2989 return vreinterpretq_s32_u32(vtstq_s32(aVec, sel));
2990}
2991
2992static SIMD_INLINE Vec<Float, 16> int2bits(const uint64_t a, OutputType<Float>,
2993 Integer<16>)
2994{
2995 return reinterpret(int2bits(a, OutputType<Int>(), Integer<16>()),
2996 OutputType<Float>());
2997}
2998
2999#ifdef SIMD_64BIT_TYPES
3000static SIMD_INLINE Vec<Long, 16> int2bits(const uint64_t a, OutputType<Long>,
3001 Integer<16>)
3002{
3003 int64x2_t aVec = vdupq_n_s64(a & 0xf);
3004 int64x2_t sel = {0x01, 0x02};
3005 return vreinterpretq_s64_u64(vtstq_s64(aVec, sel));
3006}
3007static SIMD_INLINE Vec<Double, 16> int2bits(const uint64_t a,
3008 OutputType<Double>, Integer<16>)
3009{
3010 return reinterpret(int2bits(a, OutputType<Long>(), Integer<16>()),
3011 OutputType<Double>());
3012}
3013#endif
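// int2bits differs from int2msb in that bit i is broadcast to all bits of
// lane i (vtst yields all-ones where the tested bit is set), producing
// ready-to-use lane masks. A minimal sketch (the input value is arbitrary):
//
//   auto m = int2bits(0x2, OutputType<Int>(), Integer<16>());
//   // lanes: {0x00000000, 0xffffffff, 0x00000000, 0x00000000}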
3014
3015// ---------------------------------------------------------------------------
3016// iota
3017// ---------------------------------------------------------------------------
3018
3019// 30. Jan 23 (Jonas Keller): added iota
3020
3021static SIMD_INLINE Vec<Byte, 16> iota(OutputType<Byte>, Integer<16>)
3022{
3023 uint8x16_t res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3024 return res;
3025}
3026
3027static SIMD_INLINE Vec<SignedByte, 16> iota(OutputType<SignedByte>, Integer<16>)
3028{
3029 int8x16_t res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3030 return res;
3031}
3032
3033static SIMD_INLINE Vec<Word, 16> iota(OutputType<Word>, Integer<16>)
3034{
3035 uint16x8_t res = {0, 1, 2, 3, 4, 5, 6, 7};
3036 return res;
3037}
3038
3039static SIMD_INLINE Vec<Short, 16> iota(OutputType<Short>, Integer<16>)
3040{
3041 int16x8_t res = {0, 1, 2, 3, 4, 5, 6, 7};
3042 return res;
3043}
3044
3045static SIMD_INLINE Vec<Int, 16> iota(OutputType<Int>, Integer<16>)
3046{
3047 int32x4_t res = {0, 1, 2, 3};
3048 return res;
3049}
3050
3051static SIMD_INLINE Vec<Float, 16> iota(OutputType<Float>, Integer<16>)
3052{
3053 float32x4_t res = {0.0f, 1.0f, 2.0f, 3.0f};
3054 return res;
3055}
3056
3057#ifdef SIMD_64BIT_TYPES
3058static SIMD_INLINE Vec<Long, 16> iota(OutputType<Long>, Integer<16>)
3059{
3060 int64x2_t res = {0, 1};
3061 return res;
3062}
3063static SIMD_INLINE Vec<Double, 16> iota(OutputType<Double>, Integer<16>)
3064{
3065 float64x2_t res = {0.0, 1.0};
3066 return res;
3067}
3068#endif
3069} // namespace base
3070} // namespace internal
3071} // namespace simd
3072
3073#endif
3074
3075#endif // SIMD_VEC_BASE_IMPL_NEON_16_H_