T-SIMD v31.1.0
A C++ template SIMD library
base_impl_intel64.H
1// ===========================================================================
2//
3// encapsulation for AVX512 Intel vector extensions
4// inspired by Agner Fog's C++ Vector Class Library
5// http://www.agner.org/optimize/#vectorclass
6// (VCL License: GNU General Public License Version 3,
7// http://www.gnu.org/licenses/gpl-3.0.en.html)
8//
9// Changes to unpack, zip and unzip functions in 2022 by
10// Jan-Lukas Wolf (jawolf@techfak.uni-bielefeld.de)
11//
12// This source code file is part of the following software:
13//
14// - the low-level C++ template SIMD library
15// - the SIMD implementation of the MinWarping and the 2D-Warping methods
16// for local visual homing.
17//
18// The software is provided based on the accompanying license agreement in the
19// file LICENSE.md.
20// The software is provided "as is" without any warranty by the licensor and
21// without any liability of the licensor, and the software may not be
22// distributed by the licensee; see the license agreement for details.
23//
24// (C) Ralf Möller
25// Computer Engineering
26// Faculty of Technology
27// Bielefeld University
28// www.ti.uni-bielefeld.de
29//
30// ===========================================================================
31
32// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
33// namespace
34// 13. May 23 (Jonas Keller): added Double support
35
36#pragma once
37#ifndef SIMD_VEC_BASE_IMPL_INTEL_64_H_
38#define SIMD_VEC_BASE_IMPL_INTEL_64_H_
39
40#include "../alloc.H"
41#include "../defs.H"
42#include "../types.H"
43#include "../vec.H"
44#include "base_impl_intel16.H"
45#include "base_impl_intel32.H"
46#include "intrins_intel.H"
47
#include <cmath> // std::rint (used in the non-AVX512DQ cvts workarounds)
48#include <cstddef>
49#include <cstdint>
50#include <limits>
51#include <type_traits>
52
53#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_64_AVAIL_) && \
54 !defined(SIMDVEC_SANDBOX)
55
56namespace simd {
57
58// ===========================================================================
59// NOTES:
60//
61// - setting zero inside the function is not inefficient, see:
62// http://stackoverflow.com/questions/26807285/...
63// ...are-static-static-local-sse-avx-variables-blocking-a-xmm-ymm-register
64//
65// - for some data types (Int, Float) there are no saturated versions
66// of add/sub instructions; in this case we use the unsaturated version;
67// the user is responsible for avoiding overflows
68// ===========================================================================
69
70// ===========================================================================
71// Vec integer specialization for AVX512 v
72// ===========================================================================
73
74// partial specialization for SIMD_WIDTH = 64
75template <typename T>
76class Vec<T, 64>
77{
78 __m512i zmm = _mm512_setzero_si512();
79
80public:
81 using Type = T;
82 static constexpr size_t elements = 64 / sizeof(T);
83 static constexpr size_t elems = elements;
84 static constexpr size_t bytes = 64;
85
86 Vec() = default;
87 Vec(const __m512i &x) { zmm = x; }
88 Vec &operator=(const __m512i &x)
89 {
90 zmm = x;
91 return *this;
92 }
93 operator __m512i() const { return zmm; }
94 // for avx512bw emulation
95 Vec(const Vec<T, 32> &lo, const Vec<T, 32> &hi)
96 {
97 zmm = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
98 }
99 SIMD_INLINE Vec<T, 32> lo() const { return _mm512_castsi512_si256(zmm); }
100 SIMD_INLINE Vec<T, 32> hi() const
101 {
102 return _mm512_extracti64x4_epi64(zmm, 1);
103 }
104 // 29. Nov 22 (Jonas Keller):
105 // defined operators new and delete to ensure proper alignment, since
106 // the default new and delete are not guaranteed to do so before C++17
107 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
108 void operator delete(void *p) { aligned_free(p); }
109 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
110 void operator delete[](void *p) { aligned_free(p); }
111 // 05. Sep 23 (Jonas Keller): added allocator
112 using allocator = aligned_allocator<Vec<T, bytes>, bytes>;
113};
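// Illustrative usage sketch (not part of the original header; it only uses
// members defined above and would additionally require <vector> in user code):
//
//   simd::Vec<simd::Int, 64> v;                       // zmm defaults to zero
//   static_assert(simd::Vec<simd::Int, 64>::elements == 16, "");
//   std::vector<simd::Vec<simd::Int, 64>,
//               simd::Vec<simd::Int, 64>::allocator> buf(4); // 64-byte aligned
//   simd::Vec<simd::Int, 32> lo = v.lo(), hi = v.hi();       // 256-bit halves
//   simd::Vec<simd::Int, 64> w(lo, hi);                      // recombined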
114
115// ===========================================================================
116// Vec float specialization for AVX512 v
117// ===========================================================================
118
119template <>
120class Vec<Float, 64>
121{
122 __m512 zmm = _mm512_setzero_ps();
123
124public:
125 using Type = Float;
126 static constexpr size_t elements = 64 / sizeof(Float);
127 static constexpr size_t elems = elements;
128 static constexpr size_t bytes = 64;
129
130 Vec() = default;
131 Vec(const __m512 &x) { zmm = x; }
132 Vec &operator=(const __m512 &x)
133 {
134 zmm = x;
135 return *this;
136 }
137 operator __m512() const { return zmm; }
138 // for avx512bw emulation
139 Vec(const Vec<Float, 32> &lo, const Vec<Float, 32> &hi)
140 {
141 zmm = _mm512_castpd_ps(_mm512_insertf64x4(
142 _mm512_castps_pd(_mm512_castps256_ps512(lo)), _mm256_castps_pd(hi), 1));
143 }
144 SIMD_INLINE Vec<Float, 32> lo() const { return _mm512_castps512_ps256(zmm); }
145 // _mm512_extractf32x8_ps only in AVX512DQ
146 SIMD_INLINE Vec<Float, 32> hi() const
147 {
148 return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(zmm), 1));
149 }
150 // 29. Nov 22 (Jonas Keller):
151 // defined operators new and delete to ensure proper alignment, since
152 // the default new and delete are not guaranteed to do so before C++17
153 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
154 void operator delete(void *p) { aligned_free(p); }
155 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
156 void operator delete[](void *p) { aligned_free(p); }
157 // 05. Sep 23 (Jonas Keller): added allocator
158 using allocator = aligned_allocator<Vec<Float, bytes>, bytes>;
159};
160
161// ===========================================================================
162// Vec double specialization for AVX512 v
163// ===========================================================================
164
165template <>
166class Vec<Double, 64>
167{
168 __m512d zmm = _mm512_setzero_pd();
169
170public:
171 using Type = Double;
172 static constexpr size_t elements = 64 / sizeof(Double);
173 static constexpr size_t elems = elements;
174 static constexpr size_t bytes = 64;
175
176 Vec() = default;
177 Vec(const __m512d &x) { zmm = x; }
178 Vec &operator=(const __m512d &x)
179 {
180 zmm = x;
181 return *this;
182 }
183 operator __m512d() const { return zmm; }
184 // for avx512bw emulation
185 Vec(const Vec<Double, 32> &lo, const Vec<Double, 32> &hi)
186 {
187 zmm = _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1);
188 }
189 SIMD_INLINE Vec<Double, 32> lo() const { return _mm512_castpd512_pd256(zmm); }
190 SIMD_INLINE Vec<Double, 32> hi() const
191 {
192 return _mm512_extractf64x4_pd(zmm, 1);
193 }
194 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
195 void operator delete(void *p) { aligned_free(p); }
196 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
197 void operator delete[](void *p) { aligned_free(p); }
198 using allocator = aligned_allocator<Vec<Double, bytes>, bytes>;
199};
200
201namespace internal {
202namespace base {
203
204// ===========================================================================
205// auxiliary functions
206// ===========================================================================
207
208// These functions either wrap intrinsics (e.g. to handle
209// immediate arguments as template parameters), or switch between
210// implementations with different AVX-512 extensions, or provide
211// altered or additional functionality.
212// Only for use in wrapper functions!
213
214// 01. Apr 23 (Jonas Keller): removed some unnecessary internal
215// wrapper functions and inlined them directly where they were used
216
217// ---------------------------------------------------------------------------
218// alignr v
219// ---------------------------------------------------------------------------
220
221// 21. Apr 23 (Jonas Keller): replaced IMM range handling via tag dispatch
222// with static_assert, since we don't need the range handling anymore,
223// we just assert that IMM is in range
224
225template <size_t COUNT>
226static SIMD_INLINE __m512i x_mm512_alignr_epi8(__m512i h, __m512i l)
227{
228 static_assert(COUNT < 32, "");
229#ifdef __AVX512BW__
230 return _mm512_alignr_epi8(h, l, COUNT);
231#else
232 // non-avx512bw workarounds
233 // (easy since AVX512BW instructions operate on lanes anyhow)
234 const __m256i lo = _mm256_alignr_epi8(_mm512_castsi512_si256(h),
235 _mm512_castsi512_si256(l), COUNT);
236 const __m256i hi = _mm256_alignr_epi8(_mm512_extracti64x4_epi64(h, 1),
237 _mm512_extracti64x4_epi64(l, 1), COUNT);
238 return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
239#endif
240}
241
242// ---------------------------------------------------------------------------
243// transpose8x64 v
244// ---------------------------------------------------------------------------
245
246static SIMD_INLINE __m512i x_mm512_transpose8x64_epi64(__m512i a)
247{
248 return _mm512_permutexvar_epi64(_mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0), a);
249}
250
251// ---------------------------------------------------------------------------
252// evenodd8x64 v
253// ---------------------------------------------------------------------------
254
255static SIMD_INLINE __m512i x_mm512_evenodd8x64_epi64(__m512i a)
256{
257 return _mm512_permutexvar_epi64(_mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0), a);
258}
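// Sketch of the two permutations above in terms of the eight 64-bit elements
// a0..a7 (index 0 = lowest element), derived from the index vectors:
//
//   x_mm512_transpose8x64_epi64(a) -> { a0, a4, a1, a5, a2, a6, a3, a7 }
//   x_mm512_evenodd8x64_epi64(a)   -> { a0, a2, a4, a6, a1, a3, a5, a7 }
//
// i.e. transpose8x64 interleaves the two 256-bit halves element-wise, and
// evenodd8x64 gathers even-indexed elements into the low half and
// odd-indexed elements into the high half.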
259
260// ---------------------------------------------------------------------------
261// binary functions with non-avx512bw workarounds v
262// ---------------------------------------------------------------------------
263
264#ifdef __AVX512BW__
265// avx512bw is available
266#define SIMD_X_BW_INT_BINFCT_64(INTRIN) \
267 static SIMD_INLINE __m512i x_mm512_##INTRIN(__m512i a, __m512i b) \
268 { \
269 return _mm512_##INTRIN(a, b); \
270 }
271#else
272// non-avx512bw workaround
273#define SIMD_X_BW_INT_BINFCT_64(INTRIN) \
274 static SIMD_INLINE __m512i x_mm512_##INTRIN(__m512i a, __m512i b) \
275 { \
276 const __m256i lo = \
277 _mm256_##INTRIN(_mm512_castsi512_si256(a), _mm512_castsi512_si256(b)); \
278 const __m256i hi = _mm256_##INTRIN(_mm512_extracti64x4_epi64(a, 1), \
279 _mm512_extracti64x4_epi64(b, 1)); \
280 return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); \
281 }
282#endif
283
284SIMD_X_BW_INT_BINFCT_64(unpacklo_epi8)
285SIMD_X_BW_INT_BINFCT_64(unpackhi_epi8)
286SIMD_X_BW_INT_BINFCT_64(unpacklo_epi16)
287SIMD_X_BW_INT_BINFCT_64(unpackhi_epi16)
288SIMD_X_BW_INT_BINFCT_64(shuffle_epi8)
289SIMD_X_BW_INT_BINFCT_64(packs_epi16)
290SIMD_X_BW_INT_BINFCT_64(packs_epi32)
291SIMD_X_BW_INT_BINFCT_64(packus_epi16)
292SIMD_X_BW_INT_BINFCT_64(packus_epi32)
293
294// ---------------------------------------------------------------------------
295// non-existent avx512 functions emulated via available avx512f instructions v
296// ---------------------------------------------------------------------------
297
298// ---------------------------------------------------------------------------
299// x_mm512_movm_epi32 v
300// ---------------------------------------------------------------------------
301
302// https://stackoverflow.com/questions/48099006/
303// different-semantic-of-comparison-intrinsic-instructions-in-avx512
304
305static SIMD_INLINE __m512i x_mm512_movm_epi32(__mmask16 k)
306{
307#ifdef __AVX512DQ__
308 return _mm512_movm_epi32(k);
309#else
310 return _mm512_maskz_mov_epi32(k, _mm512_set1_epi32(-1));
311#endif
312}
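// Scalar model of the non-AVX512DQ fallback above (illustrative only): for
// each of the 16 elements, _mm512_maskz_mov_epi32(k, all_ones) yields
//   dst[i] = ((k >> i) & 1) ? 0xFFFFFFFF : 0;
// which is exactly the mask-to-vector semantics of _mm512_movm_epi32.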
313
314// ---------------------------------------------------------------------------
315// x_mm512_movm_epi64 v
316// ---------------------------------------------------------------------------
317
318static SIMD_INLINE __m512i x_mm512_movm_epi64(__mmask8 k)
319{
320#ifdef __AVX512DQ__
321 return _mm512_movm_epi64(k);
322#else
323 return _mm512_maskz_mov_epi64(k, _mm512_set1_epi64(-1));
324#endif
325}
326
327// ###########################################################################
328// ###########################################################################
329// ###########################################################################
330
331// ===========================================================================
332// Vec template function specializations or overloading for AVX-512
333// ===========================================================================
334
335// ---------------------------------------------------------------------------
336// reinterpretation casts v
337// ---------------------------------------------------------------------------
338
339// 08. Apr 23 (Jonas Keller): used enable_if for cleaner implementation
340
341// between all integer types
342template <typename Tdst, typename Tsrc,
343 SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
344 std::is_integral<Tdst>::value &&
345 std::is_integral<Tsrc>::value))>
346static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Tsrc, 64> &vec,
347 OutputType<Tdst>)
348{
349 // 26. Nov 22 (Jonas Keller): reinterpret_cast is technically undefined
350 // behavior, so just rewrapping the vector register in a new Vec instead
351 // return reinterpret_cast<const Vec<Tdst,64>&>(vec);
352 return Vec<Tdst, 64>(__m512i(vec));
353}
354
355// from float to any integer type
356template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
357static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Float, 64> &vec,
358 OutputType<Tdst>)
359{
360 return _mm512_castps_si512(vec);
361}
362
363// from any integer type to float
364template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
365static SIMD_INLINE Vec<Float, 64> reinterpret(const Vec<Tsrc, 64> &vec,
366 OutputType<Float>)
367{
368 return _mm512_castsi512_ps(vec);
369}
370
371// from double to any integer type
372template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
373static SIMD_INLINE Vec<Tdst, 64> reinterpret(const Vec<Double, 64> &vec,
374 OutputType<Tdst>)
375{
376 return _mm512_castpd_si512(vec);
377}
378
379// from any integer type to double
380template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
381static SIMD_INLINE Vec<Double, 64> reinterpret(const Vec<Tsrc, 64> &vec,
382 OutputType<Double>)
383{
384 return _mm512_castsi512_pd(vec);
385}
386
387// from float to double
388static SIMD_INLINE Vec<Double, 64> reinterpret(const Vec<Float, 64> &vec,
389 OutputType<Double>)
390{
391 return _mm512_castps_pd(vec);
392}
393
394// from double to float
395static SIMD_INLINE Vec<Float, 64> reinterpret(const Vec<Double, 64> &vec,
396 OutputType<Float>)
397{
398 return _mm512_castpd_ps(vec);
399}
400
401// between identical types
402template <typename T>
403static SIMD_INLINE Vec<T, 64> reinterpret(const Vec<T, 64> &vec, OutputType<T>)
404{
405 return vec;
406}
407
408// ---------------------------------------------------------------------------
409// convert (without changes in the number of elements) v
410// ---------------------------------------------------------------------------
411
412// conversion with saturation; we wanted a fast solution that
413// doesn't trigger the overflow which results in a negative two's
414// complement result ("invalid int32": 0x80000000); therefore we clamp
415// the positive values at the maximal positive float that is
416// convertible to int32 without overflow (2147483520 = 0x7fffff80);
417// negative values cannot overflow (they are clamped to the invalid int,
418// which is the most negative int32)
419static SIMD_INLINE Vec<Int, 64> cvts(const Vec<Float, 64> &a, OutputType<Int>)
420{
421 // TODO: analyze much more complex solution for cvts at
422 // TODO: http://stackoverflow.com/questions/9157373/
423 // TODO: most-efficient-way-to-convert-vector-of-float-to-vector-of-uint32
424 __m512 clip = _mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
425 return _mm512_cvtps_epi32(_mm512_min_ps(clip, a));
426}
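// Scalar model of the clamping above (illustrative sketch, not part of the
// original header; the helper name is hypothetical, and it would need
// <algorithm>, <cmath> and the default round-to-nearest MXCSR mode):
#if 0
static Int cvts_float_to_int_model(Float x)
{
  const Float clip = 2147483520.0f; // largest float <= INT32_MAX (0x7fffff80)
  x = std::min(clip, x);
  // _mm512_cvtps_epi32 clamps anything below INT32_MIN to 0x80000000, which
  // already equals the most negative int32; the scalar model clamps
  // explicitly to keep the cast well-defined
  x = std::max(x, -2147483648.0f);
  return static_cast<Int>(std::rint(x));
}
#endif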
427
428// saturation is not necessary in this case
429static SIMD_INLINE Vec<Float, 64> cvts(const Vec<Int, 64> &a, OutputType<Float>)
430{
431 return _mm512_cvtepi32_ps(a);
432}
433
434static SIMD_INLINE Vec<Long, 64> cvts(const Vec<Double, 64> &a,
435 OutputType<Long>)
436{
437 const auto clip = _mm512_set1_pd(MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64);
438 const auto clipped = _mm512_min_pd(clip, a);
439#ifdef __AVX512DQ__
440 return _mm512_cvtpd_epi64(clipped);
441#else
442 // workaround from https://stackoverflow.com/a/41148578 only works for
443 // values in range [-2^52, 2^52]
444 // using serial workaround instead
445 // TODO: serial workaround is slow, find parallel workaround
446 Double tmpD[8] SIMD_ATTR_ALIGNED(64);
447 _mm512_store_pd(tmpD, clipped);
448 Long tmpL[8] SIMD_ATTR_ALIGNED(64);
449 for (size_t i = 0; i < 8; ++i) {
450 tmpL[i] = static_cast<Long>(std::rint(tmpD[i]));
451 }
452 return _mm512_load_si512((__m512i *) tmpL);
453#endif
454}
455
456static SIMD_INLINE Vec<Double, 64> cvts(const Vec<Long, 64> &a,
457 OutputType<Double>)
458{
459#ifdef __AVX512DQ__
460 return _mm512_cvtepi64_pd(a);
461#else
462#if 0
463 // workaround from https://stackoverflow.com/a/41148578 (int64_t -> double) (modified)
464 __m512i xH = _mm512_srai_epi32(a, 16);
465 xH = _mm512_and_si512(xH, _mm512_set1_epi32(0xffff0000));
466 xH = _mm512_add_epi64(
467 xH, _mm512_castpd_si512(_mm512_set1_pd(442721857769029238784.))); // 3*2^67
468 __m512i xL = _mm512_or_si512(
469 _mm512_and_si512(a, _mm512_set1_epi64(0x0000ffffffffffff)),
470 _mm512_castpd_si512(_mm512_set1_pd(0x0010000000000000))); // 2^52
471 __m512d f =
472 _mm512_sub_pd(_mm512_castsi512_pd(xH),
473 _mm512_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
474 return _mm512_add_pd(f, _mm512_castsi512_pd(xL));
475#else
476 // the workaround above does not work
477 // TODO: why?
478 // so we use a serial workaround instead
479 Long tmpL[8] SIMD_ATTR_ALIGNED(64);
480 _mm512_store_si512((__m512i *) tmpL, a);
481 Double tmpD[8] SIMD_ATTR_ALIGNED(64);
482 for (size_t i = 0; i < 8; ++i) { tmpD[i] = static_cast<Double>(tmpL[i]); }
483 return _mm512_load_pd(tmpD);
484#endif
485#endif
486}
487
488// ---------------------------------------------------------------------------
489// setzero v
490// ---------------------------------------------------------------------------
491
492template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
493static SIMD_INLINE Vec<T, 64> setzero(OutputType<T>, Integer<64>)
494{
495 return _mm512_setzero_si512();
496}
497
498static SIMD_INLINE Vec<Float, 64> setzero(OutputType<Float>, Integer<64>)
499{
500 return _mm512_setzero_ps();
501}
502
503static SIMD_INLINE Vec<Double, 64> setzero(OutputType<Double>, Integer<64>)
504{
505 return _mm512_setzero_pd();
506}
507
508// ---------------------------------------------------------------------------
509// set1 v
510// ---------------------------------------------------------------------------
511
512static SIMD_INLINE Vec<Byte, 64> set1(Byte a, Integer<64>)
513{
514 return _mm512_set1_epi8(a);
515}
516
517static SIMD_INLINE Vec<SignedByte, 64> set1(SignedByte a, Integer<64>)
518{
519 return _mm512_set1_epi8(a);
520}
521
522static SIMD_INLINE Vec<Word, 64> set1(Word a, Integer<64>)
523{
524 return _mm512_set1_epi16(a);
525}
526
527static SIMD_INLINE Vec<Short, 64> set1(Short a, Integer<64>)
528{
529 return _mm512_set1_epi16(a);
530}
531
532static SIMD_INLINE Vec<Int, 64> set1(Int a, Integer<64>)
533{
534 return _mm512_set1_epi32(a);
535}
536
537static SIMD_INLINE Vec<Long, 64> set1(Long a, Integer<64>)
538{
539 return _mm512_set1_epi64(a);
540}
541
542static SIMD_INLINE Vec<Float, 64> set1(Float a, Integer<64>)
543{
544 return _mm512_set1_ps(a);
545}
546
547static SIMD_INLINE Vec<Double, 64> set1(Double a, Integer<64>)
548{
549 return _mm512_set1_pd(a);
550}
551
552// ---------------------------------------------------------------------------
553// load v
554// ---------------------------------------------------------------------------
555
556template <typename T>
557static SIMD_INLINE Vec<T, 64> load(const T *const p, Integer<64>)
558{
559 // AVX-512 load and store instructions need 64-byte alignment
560 // (the lower 6 address bits must be zero)
561 SIMD_CHECK_ALIGNMENT(p, 64);
562 return _mm512_load_si512((__m512i *) p);
563}
564
565static SIMD_INLINE Vec<Float, 64> load(const Float *const p, Integer<64>)
566{
567 // AVX-512 load and store instructions need 64-byte alignment
568 // (the lower 6 address bits must be zero)
569 SIMD_CHECK_ALIGNMENT(p, 64);
570 return _mm512_load_ps(p);
571}
572
573static SIMD_INLINE Vec<Double, 64> load(const Double *const p, Integer<64>)
574{
575 // AVX-512 load and store instructions need 64-byte alignment
576 // (the lower 6 address bits must be zero)
577 SIMD_CHECK_ALIGNMENT(p, 64);
578 return _mm512_load_pd(p);
579}
580
581// ---------------------------------------------------------------------------
582// loadu v
583// ---------------------------------------------------------------------------
584
585template <typename T>
586static SIMD_INLINE Vec<T, 64> loadu(const T *const p, Integer<64>)
587{
588 return _mm512_loadu_si512((__m512i *) p);
589}
590
591static SIMD_INLINE Vec<Float, 64> loadu(const Float *const p, Integer<64>)
592{
593 return _mm512_loadu_ps(p);
594}
595
596static SIMD_INLINE Vec<Double, 64> loadu(const Double *const p, Integer<64>)
597{
598 return _mm512_loadu_pd(p);
599}
600
601// ---------------------------------------------------------------------------
602// store v
603// ---------------------------------------------------------------------------
604
605// all integer versions
606template <typename T>
607static SIMD_INLINE void store(T *const p, const Vec<T, 64> &a)
608{
609 // AVX-512 load and store instructions need 64-byte alignment
610 // (the lower 6 address bits must be zero)
611 SIMD_CHECK_ALIGNMENT(p, 64);
612 _mm512_store_si512((__m512i *) p, a);
613}
614
615// float version
616static SIMD_INLINE void store(Float *const p, const Vec<Float, 64> &a)
617{
618 // AVX-512 load and store instructions need 64-byte alignment
619 // (the lower 6 address bits must be zero)
620 SIMD_CHECK_ALIGNMENT(p, 64);
621 _mm512_store_ps(p, a);
622}
623
624// double version
625static SIMD_INLINE void store(Double *const p, const Vec<Double, 64> &a)
626{
627 // AVX-512 load and store instructions need 64-byte alignment
628 // (the lower 6 address bits must be zero)
629 SIMD_CHECK_ALIGNMENT(p, 64);
630 _mm512_store_pd(p, a);
631}
632
633// ---------------------------------------------------------------------------
634// storeu v
635// ---------------------------------------------------------------------------
636
637// all integer versions
638template <typename T>
639static SIMD_INLINE void storeu(T *const p, const Vec<T, 64> &a)
640{
641 _mm512_storeu_si512((__m512i *) p, a);
642}
643
644// float version
645static SIMD_INLINE void storeu(Float *const p, const Vec<Float, 64> &a)
646{
647 _mm512_storeu_ps(p, a);
648}
649
650// double version
651static SIMD_INLINE void storeu(Double *const p, const Vec<Double, 64> &a)
652{
653 _mm512_storeu_pd(p, a);
654}
655
656// ---------------------------------------------------------------------------
657// stream_store v
658// ---------------------------------------------------------------------------
659
660// all integer versions
661template <typename T>
662static SIMD_INLINE void stream_store(T *const p, const Vec<T, 64> &a)
663{
664 // AVX-512 load and store instructions need 64-byte alignment
665 // (the lower 6 address bits must be zero)
666 SIMD_CHECK_ALIGNMENT(p, 64);
667 _mm512_stream_si512((__m512i *) p, a);
668}
669
670// float version
671static SIMD_INLINE void stream_store(Float *const p, const Vec<Float, 64> &a)
672{
673 // AVX-512 load and store instructions need 64-byte alignment
674 // (the lower 6 address bits must be zero)
675 SIMD_CHECK_ALIGNMENT(p, 64);
676 _mm512_stream_ps(p, a);
677}
678
679// double version
680static SIMD_INLINE void stream_store(Double *const p, const Vec<Double, 64> &a)
681{
682 // AVX-512 load and store instructions need 64-byte alignment
683 // (the lower 6 address bits must be zero)
684 SIMD_CHECK_ALIGNMENT(p, 64);
685 _mm512_stream_pd(p, a);
686}
687
688// ---------------------------------------------------------------------------
689// extract v
690// ---------------------------------------------------------------------------
691
692template <size_t COUNT>
693static SIMD_INLINE Byte extract(const Vec<Byte, 64> &a)
694{
695 SIMD_IF_CONSTEXPR (COUNT < 64) {
696 return _mm_extract_epi8(_mm512_extracti32x4_epi32(a, COUNT >> 4),
697 COUNT % 16);
698 } else {
699 return 0;
700 }
701}
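// Note on the index arithmetic used by the extract functions in this
// section: the 512-bit register is viewed as four 128-bit lanes, so for an
// element index COUNT the lane is COUNT / elements_per_lane (written as a
// shift) and the position inside that lane is COUNT % elements_per_lane,
// e.g. for Byte, COUNT = 37 -> lane 37 >> 4 = 2, byte 37 % 16 = 5.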
702
703template <size_t COUNT>
704static SIMD_INLINE SignedByte extract(const Vec<SignedByte, 64> &a)
705{
706 SIMD_IF_CONSTEXPR (COUNT < 64) {
707 return _mm_extract_epi8(_mm512_extracti32x4_epi32(a, COUNT >> 4),
708 COUNT % 16);
709 } else {
710 return 0;
711 }
712}
713
714template <size_t COUNT>
715static SIMD_INLINE Word extract(const Vec<Word, 64> &a)
716{
717 SIMD_IF_CONSTEXPR (COUNT < 32) {
718 return _mm_extract_epi16(_mm512_extracti32x4_epi32(a, COUNT >> 3),
719 COUNT % 8);
720 } else {
721 return 0;
722 }
723}
724
725template <size_t COUNT>
726static SIMD_INLINE Short extract(const Vec<Short, 64> &a)
727{
728 SIMD_IF_CONSTEXPR (COUNT < 32) {
729 return _mm_extract_epi16(_mm512_extracti32x4_epi32(a, COUNT >> 3),
730 COUNT % 8);
731 } else {
732 return 0;
733 }
734}
735
736template <size_t COUNT>
737static SIMD_INLINE Int extract(const Vec<Int, 64> &a)
738{
739 SIMD_IF_CONSTEXPR (COUNT < 16) {
740 return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, COUNT >> 2),
741 COUNT % 4);
742 } else {
743 return 0;
744 }
745}
746
747template <size_t COUNT>
748static SIMD_INLINE Long extract(const Vec<Long, 64> &a)
749{
750 SIMD_IF_CONSTEXPR (COUNT < 8) {
751 return _mm_extract_epi64(_mm512_extracti32x4_epi32(a, COUNT >> 1),
752 COUNT % 2);
753 } else {
754 return 0;
755 }
756}
757
758template <size_t COUNT>
759static SIMD_INLINE Float extract(const Vec<Float, 64> &a)
760{
761 SIMD_IF_CONSTEXPR (COUNT < 16) {
762 return ::simd::internal::bit_cast<Float>(
763 _mm_extract_ps(_mm512_extractf32x4_ps(a, COUNT >> 2), COUNT % 4));
764 } else {
765 return 0;
766 }
767}
768
769template <size_t COUNT>
770static SIMD_INLINE Double extract(const Vec<Double, 64> &a)
771{
772 SIMD_IF_CONSTEXPR (COUNT < 8) {
773 return ::simd::internal::bit_cast<Double>(_mm_extract_epi64(
774 _mm512_extracti32x4_epi32(_mm512_castpd_si512(a), COUNT >> 1),
775 COUNT % 2));
776 } else {
777 return 0;
778 }
779}
780
781// ---------------------------------------------------------------------------
782// extract 128-bit-lane as Vec<T, 16>
783// ---------------------------------------------------------------------------
784
785// contributed by Adam Marschall
786
787// generalized extract of 128-bit-lanes (LANE_INDEX = 0..3)
788template <size_t LANE_INDEX, typename T>
789static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 64> &a)
790{
791 const auto intA = reinterpret(a, OutputType<Int>());
792 const Vec<Int, 16> intRes = _mm512_extracti32x4_epi32(intA, LANE_INDEX);
793 return reinterpret(intRes, OutputType<T>());
794}
795
796// ---------------------------------------------------------------------------
797// add v
798// ---------------------------------------------------------------------------
799
800#ifdef __AVX512BW__
801
802static SIMD_INLINE Vec<Byte, 64> add(const Vec<Byte, 64> &a,
803 const Vec<Byte, 64> &b)
804{
805 return _mm512_add_epi8(a, b);
806}
807
808static SIMD_INLINE Vec<SignedByte, 64> add(const Vec<SignedByte, 64> &a,
809 const Vec<SignedByte, 64> &b)
810{
811 return _mm512_add_epi8(a, b);
812}
813
814static SIMD_INLINE Vec<Word, 64> add(const Vec<Word, 64> &a,
815 const Vec<Word, 64> &b)
816{
817 return _mm512_add_epi16(a, b);
818}
819
820static SIMD_INLINE Vec<Short, 64> add(const Vec<Short, 64> &a,
821 const Vec<Short, 64> &b)
822{
823 return _mm512_add_epi16(a, b);
824}
825
826#else
827
828// non-avx512bw workaround
829template <typename T>
830static SIMD_INLINE Vec<T, 64> add(const Vec<T, 64> &a, const Vec<T, 64> &b)
831{
832 return Vec<T, 64>(add(a.lo(), b.lo()), add(a.hi(), b.hi()));
833}
834
835#endif
836
837static SIMD_INLINE Vec<Int, 64> add(const Vec<Int, 64> &a,
838 const Vec<Int, 64> &b)
839{
840 return _mm512_add_epi32(a, b);
841}
842
843static SIMD_INLINE Vec<Long, 64> add(const Vec<Long, 64> &a,
844 const Vec<Long, 64> &b)
845{
846 return _mm512_add_epi64(a, b);
847}
848
849static SIMD_INLINE Vec<Float, 64> add(const Vec<Float, 64> &a,
850 const Vec<Float, 64> &b)
851{
852 return _mm512_add_ps(a, b);
853}
854
855static SIMD_INLINE Vec<Double, 64> add(const Vec<Double, 64> &a,
856 const Vec<Double, 64> &b)
857{
858 return _mm512_add_pd(a, b);
859}
860
861// ---------------------------------------------------------------------------
862// adds
863// ---------------------------------------------------------------------------
864
865#ifdef __AVX512BW__
866
867static SIMD_INLINE Vec<Byte, 64> adds(const Vec<Byte, 64> &a,
868 const Vec<Byte, 64> &b)
869{
870 return _mm512_adds_epu8(a, b);
871}
872
873static SIMD_INLINE Vec<SignedByte, 64> adds(const Vec<SignedByte, 64> &a,
874 const Vec<SignedByte, 64> &b)
875{
876 return _mm512_adds_epi8(a, b);
877}
878
879static SIMD_INLINE Vec<Word, 64> adds(const Vec<Word, 64> &a,
880 const Vec<Word, 64> &b)
881{
882 return _mm512_adds_epu16(a, b);
883}
884
885static SIMD_INLINE Vec<Short, 64> adds(const Vec<Short, 64> &a,
886 const Vec<Short, 64> &b)
887{
888 return _mm512_adds_epi16(a, b);
889}
890
891#else
892
893// non-avx512bw workaround
894template <typename T>
895static SIMD_INLINE Vec<T, 64> adds(const Vec<T, 64> &a, const Vec<T, 64> &b)
896{
897 return Vec<T, 64>(adds(a.lo(), b.lo()), adds(a.hi(), b.hi()));
898}
899
900#endif
901
902static SIMD_INLINE Vec<Int, 64> adds(const Vec<Int, 64> &a,
903 const Vec<Int, 64> &b)
904{
905 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
906 // saturated
907
908 // _mm512_adds_epi32 does not exist, workaround:
909 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
910 // addition occurs if and only if the operands have the same sign and the
911 // sum has a sign opposite to that of the operands."
912 const __m512i sum = _mm512_add_epi32(a, b);
913 const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
914 const __m512i sumHasDiffSign = _mm512_xor_si512(a, sum);
915 // indicates when an overflow has occurred
916 const __m512i overflow =
917 _mm512_srai_epi32(_mm512_andnot_si512(opsHaveDiffSign, sumHasDiffSign), 31);
918 // saturated sum used if overflow occurred (0x7FFFFFFF=max positive int, when
919 // sign of a (and thus b as well) is 0, 0x80000000=min negative int, when sign
920 // of a (and thus b as well) is 1)
921 const __m512i saturatedSum =
922 _mm512_xor_si512(_mm512_srai_epi32(a, 31), _mm512_set1_epi32(0x7FFFFFFF));
923 // return saturated sum if overflow occurred, otherwise return sum
924 return _mm512_or_si512(_mm512_andnot_si512(overflow, sum),
925 _mm512_and_si512(overflow, saturatedSum));
926}
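// Scalar model of the branch-free saturation above (illustrative sketch, not
// part of the original header; the helper name is hypothetical and an
// arithmetic right shift is assumed for signed values, mirroring
// _mm512_srai_epi32):
#if 0
static Int adds_int_model(Int a, Int b)
{
  const Int sum = Int(uint32_t(a) + uint32_t(b)); // wrap-around sum
  // overflow iff a and b have the same sign and sum's sign differs from a's
  const bool overflow = ((~(a ^ b)) & (a ^ sum)) < 0;
  // 0x7FFFFFFF (INT32_MAX) if a >= 0, 0x80000000 (INT32_MIN) if a < 0
  const Int saturated = Int((a >> 31) ^ 0x7FFFFFFF);
  return overflow ? saturated : sum;
}
#endif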
927
928static SIMD_INLINE Vec<Long, 64> adds(const Vec<Long, 64> &a,
929 const Vec<Long, 64> &b)
930{
931 // _mm512_adds_epi64 does not exist, workaround:
932 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
933 // addition occurs if and only if the operands have the same sign and the
934 // sum has a sign opposite to that of the operands."
935 const __m512i sum = _mm512_add_epi64(a, b);
936 const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
937 const __m512i sumHasDiffSign = _mm512_xor_si512(a, sum);
938 // indicates when an overflow has occurred
939 const __m512i overflow =
940 _mm512_srai_epi64(_mm512_andnot_si512(opsHaveDiffSign, sumHasDiffSign), 63);
941 // saturated sum used if overflow occurred (0x7FFFFFFFFFFFFFFF=max positive
942 // long, when sign of a (and thus b as well) is 0, 0x8000000000000000=min
943 // negative long, when sign of a (and thus b as well) is 1)
944 const __m512i saturatedSum = _mm512_xor_si512(
945 _mm512_srai_epi64(a, 63), _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
946 // return saturated sum if overflow occurred, otherwise return sum
947 return _mm512_or_si512(_mm512_andnot_si512(overflow, sum),
948 _mm512_and_si512(overflow, saturatedSum));
949}
950
951// Float not saturated
952static SIMD_INLINE Vec<Float, 64> adds(const Vec<Float, 64> &a,
953 const Vec<Float, 64> &b)
954{
955 return _mm512_add_ps(a, b);
956}
957
958// Double not saturated
959static SIMD_INLINE Vec<Double, 64> adds(const Vec<Double, 64> &a,
960 const Vec<Double, 64> &b)
961{
962 return _mm512_add_pd(a, b);
963}
964
965// ---------------------------------------------------------------------------
966// sub v
967// ---------------------------------------------------------------------------
968
969#ifdef __AVX512BW__
970
971static SIMD_INLINE Vec<Byte, 64> sub(const Vec<Byte, 64> &a,
972 const Vec<Byte, 64> &b)
973{
974 return _mm512_sub_epi8(a, b);
975}
976
977static SIMD_INLINE Vec<SignedByte, 64> sub(const Vec<SignedByte, 64> &a,
978 const Vec<SignedByte, 64> &b)
979{
980 return _mm512_sub_epi8(a, b);
981}
982
983static SIMD_INLINE Vec<Word, 64> sub(const Vec<Word, 64> &a,
984 const Vec<Word, 64> &b)
985{
986 return _mm512_sub_epi16(a, b);
987}
988
989static SIMD_INLINE Vec<Short, 64> sub(const Vec<Short, 64> &a,
990 const Vec<Short, 64> &b)
991{
992 return _mm512_sub_epi16(a, b);
993}
994
995#else
996
997// non-avx512bw workaround
998template <typename T>
999static SIMD_INLINE Vec<T, 64> sub(const Vec<T, 64> &a, const Vec<T, 64> &b)
1000{
1001 return Vec<T, 64>(sub(a.lo(), b.lo()), sub(a.hi(), b.hi()));
1002}
1003
1004#endif
1005
1006static SIMD_INLINE Vec<Int, 64> sub(const Vec<Int, 64> &a,
1007 const Vec<Int, 64> &b)
1008{
1009 return _mm512_sub_epi32(a, b);
1010}
1011
1012static SIMD_INLINE Vec<Long, 64> sub(const Vec<Long, 64> &a,
1013 const Vec<Long, 64> &b)
1014{
1015 return _mm512_sub_epi64(a, b);
1016}
1017
1018static SIMD_INLINE Vec<Float, 64> sub(const Vec<Float, 64> &a,
1019 const Vec<Float, 64> &b)
1020{
1021 return _mm512_sub_ps(a, b);
1022}
1023
1024static SIMD_INLINE Vec<Double, 64> sub(const Vec<Double, 64> &a,
1025 const Vec<Double, 64> &b)
1026{
1027 return _mm512_sub_pd(a, b);
1028}
1029
1030// ---------------------------------------------------------------------------
1031// subs
1032// ---------------------------------------------------------------------------
1033
1034#ifdef __AVX512BW__
1035
1036static SIMD_INLINE Vec<Byte, 64> subs(const Vec<Byte, 64> &a,
1037 const Vec<Byte, 64> &b)
1038{
1039 return _mm512_subs_epu8(a, b);
1040}
1041
1042static SIMD_INLINE Vec<SignedByte, 64> subs(const Vec<SignedByte, 64> &a,
1043 const Vec<SignedByte, 64> &b)
1044{
1045 return _mm512_subs_epi8(a, b);
1046}
1047
1048static SIMD_INLINE Vec<Word, 64> subs(const Vec<Word, 64> &a,
1049 const Vec<Word, 64> &b)
1050{
1051 return _mm512_subs_epu16(a, b);
1052}
1053
1054static SIMD_INLINE Vec<Short, 64> subs(const Vec<Short, 64> &a,
1055 const Vec<Short, 64> &b)
1056{
1057 return _mm512_subs_epi16(a, b);
1058}
1059
1060#else
1061
1062// non-avx512bw workaround
1063template <typename T>
1064static SIMD_INLINE Vec<T, 64> subs(const Vec<T, 64> &a, const Vec<T, 64> &b)
1065{
1066 return Vec<T, 64>(subs(a.lo(), b.lo()), subs(a.hi(), b.hi()));
1067}
1068
1069#endif
1070
1071static SIMD_INLINE Vec<Int, 64> subs(const Vec<Int, 64> &a,
1072 const Vec<Int, 64> &b)
1073{
1074 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
1075 // saturated
1076
1077 // _mm512_subs_epi32 does not exist, workaround:
1078 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
1079 // value of x−y [...] occurs if and only if x and y have opposite signs and
1080 // the sign of x−y [...] is opposite to that of x [...]"
1081 const __m512i diff = _mm512_sub_epi32(a, b);
1082 const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
1083 const __m512i diffHasDiffSign = _mm512_xor_si512(a, diff);
1084 // indicates when an overflow has occurred
1085 const __m512i overflow =
1086 _mm512_srai_epi32(_mm512_and_si512(opsHaveDiffSign, diffHasDiffSign), 31);
1087 // saturated diff used if overflow occurred (0x7FFFFFFF=max positive int, when
1088 // sign of a (and thus b as well) is 0, 0x80000000=min negative int, when sign
1089 // of a (and thus b as well) is 1)
1090 const __m512i saturatedDiff =
1091 _mm512_xor_si512(_mm512_srai_epi32(a, 31), _mm512_set1_epi32(0x7FFFFFFF));
1092 // return saturated diff if overflow occurred, otherwise return diff
1093 return _mm512_or_si512(_mm512_andnot_si512(overflow, diff),
1094 _mm512_and_si512(overflow, saturatedDiff));
1095}
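// Note: compared to the saturated addition above, the only structural change
// is the overflow predicate; for x - y, overflow occurs when the operands
// have *different* signs, hence the _mm512_and_si512 here instead of the
// _mm512_andnot_si512 used in adds.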
1096
1097static SIMD_INLINE Vec<Long, 64> subs(const Vec<Long, 64> &a,
1098 const Vec<Long, 64> &b)
1099{
1100 // _mm512_subs_epi64 does not exist, workaround:
1101 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
1102 // value of x−y [...] occurs if and only if x and y have opposite signs and
1103 // the sign of x−y [...] is opposite to that of x [...]"
1104 const __m512i diff = _mm512_sub_epi64(a, b);
1105 const __m512i opsHaveDiffSign = _mm512_xor_si512(a, b);
1106 const __m512i diffHasDiffSign = _mm512_xor_si512(a, diff);
1107 // indicates when an overflow has occurred
1108 const __m512i overflow =
1109 _mm512_srai_epi64(_mm512_and_si512(opsHaveDiffSign, diffHasDiffSign), 63);
1110 // saturated diff used if overflow occurred (0x7FFFFFFFFFFFFFFF=max positive
1111 // long, when sign of a (and thus b as well) is 0, 0x8000000000000000=min
1112 // negative long, when sign of a (and thus b as well) is 1)
1113 const __m512i saturatedDiff = _mm512_xor_si512(
1114 _mm512_srai_epi64(a, 63), _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
1115 // return saturated diff if overflow occurred, otherwise return diff
1116 return _mm512_or_si512(_mm512_andnot_si512(overflow, diff),
1117 _mm512_and_si512(overflow, saturatedDiff));
1118}
1119
1120// Float not saturated
1121static SIMD_INLINE Vec<Float, 64> subs(const Vec<Float, 64> &a,
1122 const Vec<Float, 64> &b)
1123{
1124 return _mm512_sub_ps(a, b);
1125}
1126
1127// Double not saturated
1128static SIMD_INLINE Vec<Double, 64> subs(const Vec<Double, 64> &a,
1129 const Vec<Double, 64> &b)
1130{
1131 return _mm512_sub_pd(a, b);
1132}
1133
1134// ---------------------------------------------------------------------------
1135// neg (negate = two's complement or unary minus), only signed types v
1136// ---------------------------------------------------------------------------
1137
1138#ifdef __AVX512BW__
1139
1140static SIMD_INLINE Vec<SignedByte, 64> neg(const Vec<SignedByte, 64> &a)
1141{
1142 return _mm512_sub_epi8(_mm512_setzero_si512(), a);
1143}
1144
1145static SIMD_INLINE Vec<Short, 64> neg(const Vec<Short, 64> &a)
1146{
1147 return _mm512_sub_epi16(_mm512_setzero_si512(), a);
1148}
1149
1150#else
1151
1152// non-avx512bw workaround
1153template <typename T>
1154static SIMD_INLINE Vec<T, 64> neg(const Vec<T, 64> &a)
1155{
1156 return Vec<T, 64>(neg(a.lo()), neg(a.hi()));
1157}
1158
1159#endif
1160
1161static SIMD_INLINE Vec<Int, 64> neg(const Vec<Int, 64> &a)
1162{
1163 return _mm512_sub_epi32(_mm512_setzero_si512(), a);
1164}
1165
1166static SIMD_INLINE Vec<Long, 64> neg(const Vec<Long, 64> &a)
1167{
1168 return _mm512_sub_epi64(_mm512_setzero_si512(), a);
1169}
1170
1171static SIMD_INLINE Vec<Float, 64> neg(const Vec<Float, 64> &a)
1172{
1173 // xor has better latency than sub
1174 return _mm512_castsi512_ps(
1175 _mm512_xor_si512(_mm512_set1_epi32(0x80000000), _mm512_castps_si512(a)));
1176}
1177
1178static SIMD_INLINE Vec<Double, 64> neg(const Vec<Double, 64> &a)
1179{
1180 // xor has better latency than sub
1181 return _mm512_castsi512_pd(_mm512_xor_si512(
1182 _mm512_set1_epi64(0x8000000000000000), _mm512_castpd_si512(a)));
1183}
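// The xor above simply flips the IEEE-754 sign bit (bit 31 for float, bit 63
// for double), which negates every value including ±0 and infinities; e.g.
// neg applied to a vector of 1.5f yields -1.5f in all 16 lanes.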
1184
1185// ---------------------------------------------------------------------------
1186// min v
1187// ---------------------------------------------------------------------------
1188
1189#ifdef __AVX512BW__
1190
1191static SIMD_INLINE Vec<Byte, 64> min(const Vec<Byte, 64> &a,
1192 const Vec<Byte, 64> &b)
1193{
1194 return _mm512_min_epu8(a, b);
1195}
1196
1197static SIMD_INLINE Vec<SignedByte, 64> min(const Vec<SignedByte, 64> &a,
1198 const Vec<SignedByte, 64> &b)
1199{
1200 return _mm512_min_epi8(a, b);
1201}
1202
1203static SIMD_INLINE Vec<Word, 64> min(const Vec<Word, 64> &a,
1204 const Vec<Word, 64> &b)
1205{
1206 return _mm512_min_epu16(a, b);
1207}
1208
1209static SIMD_INLINE Vec<Short, 64> min(const Vec<Short, 64> &a,
1210 const Vec<Short, 64> &b)
1211{
1212 return _mm512_min_epi16(a, b);
1213}
1214
1215#else
1216
1217// non-avx512bw workaround
1218template <typename T>
1219static SIMD_INLINE Vec<T, 64> min(const Vec<T, 64> &a, const Vec<T, 64> &b)
1220{
1221 return Vec<T, 64>(min(a.lo(), b.lo()), min(a.hi(), b.hi()));
1222}
1223
1224#endif
1225
1226static SIMD_INLINE Vec<Int, 64> min(const Vec<Int, 64> &a,
1227 const Vec<Int, 64> &b)
1228{
1229 return _mm512_min_epi32(a, b);
1230}
1231
1232// there is an unsigned version of min for 32 bit but we currently
1233// don't have an element type for it
1234
1235static SIMD_INLINE Vec<Long, 64> min(const Vec<Long, 64> &a,
1236 const Vec<Long, 64> &b)
1237{
1238 return _mm512_min_epi64(a, b);
1239}
1240
1241static SIMD_INLINE Vec<Float, 64> min(const Vec<Float, 64> &a,
1242 const Vec<Float, 64> &b)
1243{
1244 return _mm512_min_ps(a, b);
1245}
1246
1247static SIMD_INLINE Vec<Double, 64> min(const Vec<Double, 64> &a,
1248 const Vec<Double, 64> &b)
1249{
1250 return _mm512_min_pd(a, b);
1251}
1252
1253// ---------------------------------------------------------------------------
1254// max v
1255// ---------------------------------------------------------------------------
1256
1257#ifdef __AVX512BW__
1258
1259static SIMD_INLINE Vec<Byte, 64> max(const Vec<Byte, 64> &a,
1260 const Vec<Byte, 64> &b)
1261{
1262 return _mm512_max_epu8(a, b);
1263}
1264
1265static SIMD_INLINE Vec<SignedByte, 64> max(const Vec<SignedByte, 64> &a,
1266 const Vec<SignedByte, 64> &b)
1267{
1268 return _mm512_max_epi8(a, b);
1269}
1270
1271static SIMD_INLINE Vec<Word, 64> max(const Vec<Word, 64> &a,
1272 const Vec<Word, 64> &b)
1273{
1274 return _mm512_max_epu16(a, b);
1275}
1276
1277static SIMD_INLINE Vec<Short, 64> max(const Vec<Short, 64> &a,
1278 const Vec<Short, 64> &b)
1279{
1280 return _mm512_max_epi16(a, b);
1281}
1282
1283#else
1284
1285// non-avx512bw workaround
1286template <typename T>
1287static SIMD_INLINE Vec<T, 64> max(const Vec<T, 64> &a, const Vec<T, 64> &b)
1288{
1289 return Vec<T, 64>(max(a.lo(), b.lo()), max(a.hi(), b.hi()));
1290}
1291
1292#endif
1293
1294static SIMD_INLINE Vec<Int, 64> max(const Vec<Int, 64> &a,
1295 const Vec<Int, 64> &b)
1296{
1297 return _mm512_max_epi32(a, b);
1298}
1299
1300// there is an unsigned version of max for 32 bit but we currently
1301// don't have an element type for it
1302
1303static SIMD_INLINE Vec<Long, 64> max(const Vec<Long, 64> &a,
1304 const Vec<Long, 64> &b)
1305{
1306 return _mm512_max_epi64(a, b);
1307}
1308
1309static SIMD_INLINE Vec<Float, 64> max(const Vec<Float, 64> &a,
1310 const Vec<Float, 64> &b)
1311{
1312 return _mm512_max_ps(a, b);
1313}
1314
1315static SIMD_INLINE Vec<Double, 64> max(const Vec<Double, 64> &a,
1316 const Vec<Double, 64> &b)
1317{
1318 return _mm512_max_pd(a, b);
1319}
1320
1321// ---------------------------------------------------------------------------
1322// mul, div v
1323// ---------------------------------------------------------------------------
1324
1325// TODO: add mul/div versions for int types? or make special versions of mul
1326// TODO: and div where the result is scaled?
1327
1328static SIMD_INLINE Vec<Float, 64> mul(const Vec<Float, 64> &a,
1329 const Vec<Float, 64> &b)
1330{
1331 return _mm512_mul_ps(a, b);
1332}
1333
1334static SIMD_INLINE Vec<Double, 64> mul(const Vec<Double, 64> &a,
1335 const Vec<Double, 64> &b)
1336{
1337 return _mm512_mul_pd(a, b);
1338}
1339
1340static SIMD_INLINE Vec<Float, 64> div(const Vec<Float, 64> &a,
1341 const Vec<Float, 64> &b)
1342{
1343 return _mm512_div_ps(a, b);
1344}
1345
1346static SIMD_INLINE Vec<Double, 64> div(const Vec<Double, 64> &a,
1347 const Vec<Double, 64> &b)
1348{
1349 return _mm512_div_pd(a, b);
1350}
1351
1352// ---------------------------------------------------------------------------
1353// ceil, floor, round, truncate v
1354// ---------------------------------------------------------------------------
1355
1356// 25. Mar 23 (Jonas Keller): added versions for integer types
1357
1358// versions for integer types do nothing:
1359
1360template <typename T>
1361static SIMD_INLINE Vec<T, 64> ceil(const Vec<T, 64> &a)
1362{
1363 static_assert(std::is_integral<T>::value, "");
1364 return a;
1365}
1366
1367template <typename T>
1368static SIMD_INLINE Vec<T, 64> floor(const Vec<T, 64> &a)
1369{
1370 static_assert(std::is_integral<T>::value, "");
1371 return a;
1372}
1373
1374template <typename T>
1375static SIMD_INLINE Vec<T, 64> round(const Vec<T, 64> &a)
1376{
1377 static_assert(std::is_integral<T>::value, "");
1378 return a;
1379}
1380
1381template <typename T>
1382static SIMD_INLINE Vec<T, 64> truncate(const Vec<T, 64> &a)
1383{
1384 static_assert(std::is_integral<T>::value, "");
1385 return a;
1386}
1387
1388// see Peter Cordes at https://stackoverflow.com/questions/50854991
1389// _mm512_roundscale_ps:
1390// imm[7:4] = fraction bits (here 0), imm[1:0] = rounding mode
1391
1392static SIMD_INLINE Vec<Float, 64> ceil(const Vec<Float, 64> &a)
1393{
1394 return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
1395}
1396
1397static SIMD_INLINE Vec<Double, 64> ceil(const Vec<Double, 64> &a)
1398{
1399 return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
1400}
1401
1402static SIMD_INLINE Vec<Float, 64> floor(const Vec<Float, 64> &a)
1403{
1404 return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
1405}
1406
1407static SIMD_INLINE Vec<Double, 64> floor(const Vec<Double, 64> &a)
1408{
1409 return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
1410}
1411
1412static SIMD_INLINE Vec<Float, 64> round(const Vec<Float, 64> &a)
1413{
1414 return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1415}
1416
1417static SIMD_INLINE Vec<Double, 64> round(const Vec<Double, 64> &a)
1418{
1419 return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1420}
1421
1422static SIMD_INLINE Vec<Float, 64> truncate(const Vec<Float, 64> &a)
1423{
1424 return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1425}
1426
1427static SIMD_INLINE Vec<Double, 64> truncate(const Vec<Double, 64> &a)
1428{
1429 return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1430}
1431
1432// ---------------------------------------------------------------------------
1433// elementary mathematical functions v
1434// ---------------------------------------------------------------------------
1435
1436// estimate of a reciprocal
1437// NOTE: this has better precision than SSE and AVX versions!
1438
1439// float version
1440static SIMD_INLINE Vec<Float, 64> rcp(const Vec<Float, 64> &a)
1441{
1442 // 20. Mar 23 (Jonas Keller):
1443 // use _mm512_rcp28_ps if available, which has even better precision
1444 // and does not seem to be any slower (at least according to this:
1445 // https://github.com/tanakamura/instruction-bench/blob/master/knl.log)
1446#ifdef __AVX512ER__
1447 return _mm512_rcp28_ps(a);
1448#else
1449 return _mm512_rcp14_ps(a);
1450#endif
1451}
1452
1453// double version
1454static SIMD_INLINE Vec<Double, 64> rcp(const Vec<Double, 64> &a)
1455{
1456 // use _mm512_rcp28_pd if available, which has even better precision
1457 // and does not seem to be any slower (at least according to this:
1458 // https://github.com/tanakamura/instruction-bench/blob/master/knl.log)
1459#ifdef __AVX512ER__
1460 return _mm512_rcp28_pd(a);
1461#else
1462 return _mm512_rcp14_pd(a);
1463#endif
1464}
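// For reference (per Intel's intrinsics documentation): _mm512_rcp14_ps/pd
// guarantee a maximum relative error below 2^-14, while the AVX512ER rcp28
// variants guarantee below 2^-28; both remain approximations, so a
// Newton-Raphson refinement step is still required where full division
// accuracy is needed.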
1465
1466// estimate of the reciprocal square root
1467// NOTE: this has better precision than SSE and AVX versions!
1468
1469// float version
1470static SIMD_INLINE Vec<Float, 64> rsqrt(const Vec<Float, 64> &a)
1471{
1472 // 20. Mar 23 (Jonas Keller):
1473 // use _mm512_rsqrt28_ps if available, which has even better precision
1474 // and does not seem to be any slower (probably)
1475#ifdef __AVX512ER__
1476 return _mm512_rsqrt28_ps(a);
1477#else
1478 return _mm512_rsqrt14_ps(a);
1479#endif
1480}
1481
1482// double version
1483static SIMD_INLINE Vec<Double, 64> rsqrt(const Vec<Double, 64> &a)
1484{
1485 // use _mm512_rsqrt28_pd if available, which has even better precision
1486 // and does not seem to be any slower (probably)
1487#ifdef __AVX512ER__
1488 return _mm512_rsqrt28_pd(a);
1489#else
1490 return _mm512_rsqrt14_pd(a);
1491#endif
1492}
1493
1494// square root
1495
1496// float version
1497static SIMD_INLINE Vec<Float, 64> sqrt(const Vec<Float, 64> &a)
1498{
1499 return _mm512_sqrt_ps(a);
1500}
1501
1502// double version
1503static SIMD_INLINE Vec<Double, 64> sqrt(const Vec<Double, 64> &a)
1504{
1505 return _mm512_sqrt_pd(a);
1506}
1507
1508// ---------------------------------------------------------------------------
1509// abs v
1510// ---------------------------------------------------------------------------
1511
1512// 25. Mar 25 (Jonas Keller): added abs for unsigned integers
1513
1514// unsigned integers
1515template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
1516 &&std::is_integral<T>::value)>
1517static SIMD_INLINE Vec<T, 64> abs(const Vec<T, 64> &a)
1518{
1519 return a;
1520}
1521
1522static SIMD_INLINE Vec<SignedByte, 64> abs(const Vec<SignedByte, 64> &a)
1523{
1524#ifdef __AVX512BW__
1525 return _mm512_abs_epi8(a);
1526#else
1527 // non-avx512bw workaround
1528 return Vec<SignedByte, 64>(abs(a.lo()), abs(a.hi()));
1529#endif
1530}
1531
1532static SIMD_INLINE Vec<Short, 64> abs(const Vec<Short, 64> &a)
1533{
1534#ifdef __AVX512BW__
1535 return _mm512_abs_epi16(a);
1536#else
1537 // non-avx512bw workaround
1538 return Vec<Short, 64>(abs(a.lo()), abs(a.hi()));
1539#endif
1540}
1541
1542static SIMD_INLINE Vec<Int, 64> abs(const Vec<Int, 64> &a)
1543{
1544 return _mm512_abs_epi32(a);
1545}
1546
1547static SIMD_INLINE Vec<Long, 64> abs(const Vec<Long, 64> &a)
1548{
1549 return _mm512_abs_epi64(a);
1550}
1551
1552static SIMD_INLINE Vec<Float, 64> abs(const Vec<Float, 64> &a)
1553{
1554 return _mm512_abs_ps(a);
1555}
1556
1557static SIMD_INLINE Vec<Double, 64> abs(const Vec<Double, 64> &a)
1558{
1559 return _mm512_abs_pd(a);
1560}
1561
1562// ---------------------------------------------------------------------------
1563// unpacklo v (with permutex2var)
1564// ---------------------------------------------------------------------------
1565
1566// integer version
1567template <typename T>
1568static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1569 Part<0>, Bytes<1>)
1570{
1571#ifdef __AVX512VBMI__
1572 // element order high to low for idx
1573 __m512i idx = _mm512_set_epi8(
1574 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
1575 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
1576 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66,
1577 2, 65, 1, 64, 0);
1578 return _mm512_permutex2var_epi8(a, idx, b);
1579#else
1580 return x_mm512_unpacklo_epi8(x_mm512_transpose8x64_epi64(a),
1581 x_mm512_transpose8x64_epi64(b));
1582#endif
1583}
1584
1585// integer version
1586template <typename T>
1587static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1588 Part<0>, Bytes<2>)
1589{
1590#ifdef __AVX512BW__
1591 // element order high to low for idx
1592 __m512i idx =
1593 _mm512_set_epi16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40,
1594 8, 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
1595 return _mm512_permutex2var_epi16(a, idx, b);
1596#else
1597 return x_mm512_unpacklo_epi16(x_mm512_transpose8x64_epi64(a),
1598 x_mm512_transpose8x64_epi64(b));
1599#endif
1600}
1601
1602// integer version
1603template <typename T>
1604static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1605 Part<0>, Bytes<4>)
1606{
1607 __m512i idx =
1608 _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
1609 return _mm512_permutex2var_epi32(a, idx, b);
1610}
1611
1612// integer version
1613template <typename T>
1614static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1615 Part<0>, Bytes<8>)
1616{
1617 __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
1618 return _mm512_permutex2var_epi64(a, idx, b);
1619}
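// Sketch of the permutation above in terms of 64-bit elements (a0..a7 from
// a, b0..b7 from b, index 0 = lowest): idx entries 0..7 select from a and
// 8..15 select from b, so the result is { a0, b0, a1, b1, a2, b2, a3, b3 },
// i.e. a full-width interleave of the two low halves, unlike the lane-wise
// _mm512_unpacklo_epi64.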
1620
1621// integer version
1622template <typename T>
1623static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1624 Part<0>, Bytes<16>)
1625{
1626 __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
1627 return _mm512_permutex2var_epi64(a, idx, b);
1628}
1629
1630// integer version
1631template <typename T>
1632static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1633 Part<0>, Bytes<32>)
1634{
1635 __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
1636 return _mm512_permutex2var_epi64(a, idx, b);
1637}
1638
1639// float version
1640static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1641 const Vec<Float, 64> &b, Part<0>,
1642 Bytes<4>)
1643{
1644 __m512i idx =
1645 _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
1646 return _mm512_permutex2var_ps(a, idx, b);
1647}
1648
1649// float version
1650static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1651 const Vec<Float, 64> &b, Part<0>,
1652 Bytes<8>)
1653{
1654 __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
1655 return _mm512_castpd_ps(
1656 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1657}
1658
1659// float version
1660static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1661 const Vec<Float, 64> &b, Part<0>,
1662 Bytes<16>)
1663{
1664 __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
1665 return _mm512_castpd_ps(
1666 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1667}
1668
1669// float version
1670static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1671 const Vec<Float, 64> &b, Part<0>,
1672 Bytes<32>)
1673{
1674 __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
1675 return _mm512_castpd_ps(
1676 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1677}
1678
1679// double version
1680static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1681 const Vec<Double, 64> &b, Part<0>,
1682 Bytes<8>)
1683{
1684 __m512i idx = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
1685 return _mm512_permutex2var_pd(a, idx, b);
1686}
1687
1688// double version
1689static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1690 const Vec<Double, 64> &b, Part<0>,
1691 Bytes<16>)
1692{
1693 __m512i idx = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
1694 return _mm512_permutex2var_pd(a, idx, b);
1695}
1696
1697// double version
1698static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1699 const Vec<Double, 64> &b, Part<0>,
1700 Bytes<32>)
1701{
1702 __m512i idx = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
1703 return _mm512_permutex2var_pd(a, idx, b);
1704}
1705
1706// ---------------------------------------------------------------------------
1707// unpackhi v (with permutex2var)
1708// ---------------------------------------------------------------------------
1709
1710// integer version
1711template <typename T>
1712static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1713 Part<1>, Bytes<1>)
1714{
1715#ifdef __AVX512VBMI__
1716 // element order high to low for idx
1717 __m512i idx = _mm512_set_epi8(
1718 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
1719 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
1720 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
1721 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
1722 return _mm512_permutex2var_epi8(a, idx, b);
1723#else
1724 return x_mm512_unpackhi_epi8(x_mm512_transpose8x64_epi64(a),
1725 x_mm512_transpose8x64_epi64(b));
1726#endif
1727}
1728
1729// integer version
1730template <typename T>
1731static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1732 Part<1>, Bytes<2>)
1733{
1734#ifdef __AVX512BW__
1735 // element order high to low for idx
1736 __m512i idx = _mm512_set_epi16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26,
1737 57, 25, 56, 24, 55, 23, 54, 22, 53, 21, 52, 20,
1738 51, 19, 50, 18, 49, 17, 48, 16);
1739 return _mm512_permutex2var_epi16(a, idx, b);
1740#else
1741 return x_mm512_unpackhi_epi16(x_mm512_transpose8x64_epi64(a),
1742 x_mm512_transpose8x64_epi64(b));
1743#endif
1744}
1745
1746// integer version
1747template <typename T>
1748static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1749 Part<1>, Bytes<4>)
1750{
1751 __m512i idx = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10,
1752 25, 9, 24, 8);
1753 return _mm512_permutex2var_epi32(a, idx, b);
1754}
1755
1756// integer version
1757template <typename T>
1758static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1759 Part<1>, Bytes<8>)
1760{
1761 __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
1762 return _mm512_permutex2var_epi64(a, idx, b);
1763}
1764
1765// integer version
1766template <typename T>
1767static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1768 Part<1>, Bytes<16>)
1769{
1770 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
1771 return _mm512_permutex2var_epi64(a, idx, b);
1772}
1773
1774// integer version
1775template <typename T>
1776static SIMD_INLINE Vec<T, 64> unpack(const Vec<T, 64> &a, const Vec<T, 64> &b,
1777 Part<1>, Bytes<32>)
1778{
1779 __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
1780 return _mm512_permutex2var_epi64(a, idx, b);
1781}
1782
1783// float version
1784static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1785 const Vec<Float, 64> &b, Part<1>,
1786 Bytes<4>)
1787{
1788 __m512i idx = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10,
1789 25, 9, 24, 8);
1790 return _mm512_permutex2var_ps(a, idx, b);
1791}
1792
1793// float version
1794static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1795 const Vec<Float, 64> &b, Part<1>,
1796 Bytes<8>)
1797{
1798 __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
1799 return _mm512_castpd_ps(
1800 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1801}
1802
1803// float version
1804static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1805 const Vec<Float, 64> &b, Part<1>,
1806 Bytes<16>)
1807{
1808 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
1809 return _mm512_castpd_ps(
1810 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1811}
1812
1813// float version
1814static SIMD_INLINE Vec<Float, 64> unpack(const Vec<Float, 64> &a,
1815 const Vec<Float, 64> &b, Part<1>,
1816 Bytes<32>)
1817{
1818 __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
1819 return _mm512_castpd_ps(
1820 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1821}
1822
1823// double version
1824static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1825 const Vec<Double, 64> &b, Part<1>,
1826 Bytes<8>)
1827{
1828 __m512i idx = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
1829 return _mm512_permutex2var_pd(a, idx, b);
1830}
1831
1832// double version
1833static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1834 const Vec<Double, 64> &b, Part<1>,
1835 Bytes<16>)
1836{
1837 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
1838 return _mm512_permutex2var_pd(a, idx, b);
1839}
1840
1841// double version
1842static SIMD_INLINE Vec<Double, 64> unpack(const Vec<Double, 64> &a,
1843 const Vec<Double, 64> &b, Part<1>,
1844 Bytes<32>)
1845{
1846 __m512i idx = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
1847 return _mm512_permutex2var_pd(a, idx, b);
1848}
1849
1850// ---------------------------------------------------------------------------
1851// 128-bit-lane oriented unpacklo (with direct intrinsic calls)
1852// ---------------------------------------------------------------------------
1853
1854// contributed by Adam Marschall
1855
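// Illustration: unlike the full-width unpack above, unpack16 applies the usual
// 128-bit unpacklo to each of the four 16-byte lanes independently, e.g. for
// Bytes<1> and lane k:
//   a[16k] b[16k] a[16k+1] b[16k+1] ... a[16k+7] b[16k+7]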
1856// integer version
1857template <typename T>
1858static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1859 Part<0>, Bytes<1>)
1860{
1861 return x_mm512_unpacklo_epi8(a, b);
1862}
1863
1864// integer version
1865template <typename T>
1866static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1867 Part<0>, Bytes<2>)
1868{
1869 return x_mm512_unpacklo_epi16(a, b);
1870}
1871
1872// integer version
1873template <typename T>
1874static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1875 Part<0>, Bytes<4>)
1876{
1877 return _mm512_unpacklo_epi32(a, b);
1878}
1879
1880// integer version
1881template <typename T>
1882static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1883 Part<0>, Bytes<8>)
1884{
1885 return _mm512_unpacklo_epi64(a, b);
1886}
1887
1888// integer version
1889template <typename T>
1890static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1891 Part<0>, Bytes<16>)
1892{
1893 __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
1894 return _mm512_permutex2var_epi64(a, idx, b);
1895}
1896
1897// integer version
1898template <typename T>
1899static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1900 Part<0>, Bytes<32>)
1901{
1902 return _mm512_shuffle_i32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));
1903}
1904
1905// float version
1906static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
1907 const Vec<Float, 64> &b, Part<0>,
1908 Bytes<4>)
1909{
1910 return _mm512_unpacklo_ps(a, b);
1911}
1912
1913// float version
1914static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
1915 const Vec<Float, 64> &b, Part<0>,
1916 Bytes<8>)
1917{
1918 return _mm512_castpd_ps(
1919 _mm512_unpacklo_pd(_mm512_castps_pd(a), _mm512_castps_pd(b)));
1920}
1921
1922// float version
1923static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
1924 const Vec<Float, 64> &b, Part<0>,
1925 Bytes<16>)
1926{
1927 __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
1928 return _mm512_castpd_ps(
1929 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
1930}
1931
1932// float version
1933static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
1934 const Vec<Float, 64> &b, Part<0>,
1935 Bytes<32>)
1936{
1937 return _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));
1938}
1939
1940// double version
1941static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
1942 const Vec<Double, 64> &b, Part<0>,
1943 Bytes<8>)
1944{
1945 return _mm512_unpacklo_pd(a, b);
1946}
1947
1948// double version
1949static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
1950 const Vec<Double, 64> &b, Part<0>,
1951 Bytes<16>)
1952{
1953 __m512i idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
1954 return _mm512_permutex2var_pd(a, idx, b);
1955}
1956
1957// double version
1958static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
1959 const Vec<Double, 64> &b, Part<0>,
1960 Bytes<32>)
1961{
1962 return _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0));
1963}
1964
1965// ---------------------------------------------------------------------------
1966// 128-bit-lane oriented unpackhi v
1967// ---------------------------------------------------------------------------
1968
1969// integer version
1970template <typename T>
1971static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1972 Part<1>, Bytes<1>)
1973{
1974 return x_mm512_unpackhi_epi8(a, b);
1975}
1976
1977// integer version
1978template <typename T>
1979static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1980 Part<1>, Bytes<2>)
1981{
1982 return x_mm512_unpackhi_epi16(a, b);
1983}
1984
1985// integer version
1986template <typename T>
1987static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1988 Part<1>, Bytes<4>)
1989{
1990 return _mm512_unpackhi_epi32(a, b);
1991}
1992
1993// integer version
1994template <typename T>
1995static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
1996 Part<1>, Bytes<8>)
1997{
1998 return _mm512_unpackhi_epi64(a, b);
1999}
2000
2001// integer version
2002template <typename T>
2003static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
2004 Part<1>, Bytes<16>)
2005{
2006 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
2007 return _mm512_permutex2var_epi64(a, idx, b);
2008}
2009
2010// integer version
2011template <typename T>
2012static SIMD_INLINE Vec<T, 64> unpack16(const Vec<T, 64> &a, const Vec<T, 64> &b,
2013 Part<1>, Bytes<32>)
2014{
2015 return _mm512_shuffle_i32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));
2016}
2017
2018// float version
2019static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
2020 const Vec<Float, 64> &b, Part<1>,
2021 Bytes<4>)
2022{
2023 return _mm512_unpackhi_ps(a, b);
2024}
2025
2026// float version
2027static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
2028 const Vec<Float, 64> &b, Part<1>,
2029 Bytes<8>)
2030{
2031 return _mm512_castpd_ps(
2032 _mm512_unpackhi_pd(_mm512_castps_pd(a), _mm512_castps_pd(b)));
2033}
2034
2035// float version
2036static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
2037 const Vec<Float, 64> &b, Part<1>,
2038 Bytes<16>)
2039{
2040 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
2041 return _mm512_castpd_ps(
2042 _mm512_permutex2var_pd(_mm512_castps_pd(a), idx, _mm512_castps_pd(b)));
2043}
2044
2045// float version
2046static SIMD_INLINE Vec<Float, 64> unpack16(const Vec<Float, 64> &a,
2047 const Vec<Float, 64> &b, Part<1>,
2048 Bytes<32>)
2049{
2050 return _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));
2051}
2052
2053// double version
2054static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
2055 const Vec<Double, 64> &b, Part<1>,
2056 Bytes<8>)
2057{
2058 return _mm512_unpackhi_pd(a, b);
2059}
2060
2061// double version
2062static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
2063 const Vec<Double, 64> &b, Part<1>,
2064 Bytes<16>)
2065{
2066 __m512i idx = _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2);
2067 return _mm512_permutex2var_pd(a, idx, b);
2068}
2069
2070// double version
2071static SIMD_INLINE Vec<Double, 64> unpack16(const Vec<Double, 64> &a,
2072 const Vec<Double, 64> &b, Part<1>,
2073 Bytes<32>)
2074{
2075 return _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2));
2076}
2077
2078// ---------------------------------------------------------------------------
2079// zip v
2080// ---------------------------------------------------------------------------
2081
2082// 25. Aug 23 (Jonas Keller): Simplified zip implementation by using a single
2083// template function instead of multiple specializations.
2084
2085// a, b are passed by-value to avoid problems with identical in/out args.
2086
2087// zips blocks of NUM_ELEMS elements of type T
2088template <size_t NUM_ELEMS, typename T>
2089static SIMD_INLINE void zip(const Vec<T, 64> a, const Vec<T, 64> b,
2090 Vec<T, 64> &l, Vec<T, 64> &h)
2091{
2092 l = unpack(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
2093 h = unpack(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
2094}
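// Example (derived from the unpack index vectors above): for T = Int and
// NUM_ELEMS = 1, zip<1>(a, b, l, h) yields
//   l = a0 b0 a1 b1 ... a7 b7,   h = a8 b8 a9 b9 ... a15 b15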
2095
2096// ---------------------------------------------------------------------------
2097// zip16 (16-byte-lane oriented zip)
2098// ---------------------------------------------------------------------------
2099
2100// contributed by Adam Marschall
2101
2102// zips blocks of NUM_ELEMS elements of type T
2103template <size_t NUM_ELEMS, typename T>
2104static SIMD_INLINE void zip16(const Vec<T, 64> a, const Vec<T, 64> b,
2105 Vec<T, 64> &l, Vec<T, 64> &h)
2106{
2107 l = unpack16(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
2108 h = unpack16(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
2109}
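// Example: zip16 interleaves within each 16-byte lane instead, so for T = Int
// and NUM_ELEMS = 1, lane k of l is a[4k] b[4k] a[4k+1] b[4k+1] and lane k of
// h is a[4k+2] b[4k+2] a[4k+3] b[4k+3].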
2110
2111// ---------------------------------------------------------------------------
2112// unzip v
2113// ---------------------------------------------------------------------------
2114
2115// a, b are passed by-value to avoid problems with identical
2116// input/output args.
2117
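// Illustration (indices taken from the idxL/idxH vectors below), e.g. for
// 4-byte elements:
//   l = a0 a2 ... a14 b0 b2 ... b14,   h = a1 a3 ... a15 b1 b3 ... b15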
2118// integer version
2119template <typename T>
2120static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2121 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<1>)
2122{
2123#ifdef __AVX512VBMI__
2124 const __m512i idxL = _mm512_set_epi8(
2125 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
2126 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
2127 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
2128 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2129 const __m512i idxH = _mm512_set_epi8(
2130 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
2131 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
2132 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
2133 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2134 l = _mm512_permutex2var_epi8(a, idxL, b);
2135 h = _mm512_permutex2var_epi8(a, idxH, b);
2136#else
2137 const __m512i mask = _mm512_set_epi8(
2138 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5,
2139 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
2140 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
2141 const __m512i atmp = x_mm512_shuffle_epi8(a, mask);
2142 const __m512i btmp = x_mm512_shuffle_epi8(b, mask);
2143 l = _mm512_permutex2var_epi64(
2144 atmp, _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0), btmp);
2145 h = _mm512_permutex2var_epi64(
2146 atmp, _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1), btmp);
2147#endif
2148}
2149
2150// integer version
2151template <typename T>
2152static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2153 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<2>)
2154{
2155#ifdef __AVX512BW__
2156 const __m512i idxL = _mm512_set_epi16(
2157 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
2158 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2159 const __m512i idxH = _mm512_set_epi16(
2160 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
2161 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2162 l = _mm512_permutex2var_epi16(a, idxL, b);
2163 h = _mm512_permutex2var_epi16(a, idxH, b);
2164#else
2165 const __m512i mask = _mm512_set_epi8(
2166 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6,
2167 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5,
2168 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
2169 const __m512i atmp = x_mm512_shuffle_epi8(a, mask);
2170 const __m512i btmp = x_mm512_shuffle_epi8(b, mask);
2171 l = _mm512_permutex2var_epi64(
2172 atmp, _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0), btmp);
2173 h = _mm512_permutex2var_epi64(
2174 atmp, _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1), btmp);
2175#endif
2176}
2177
2178// integer version
2179template <typename T>
2180static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2181 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<4>)
2182{
2183 const __m512i idxL =
2184 _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2185 const __m512i idxH =
2186 _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2187 l = _mm512_permutex2var_epi32(a, idxL, b);
2188 h = _mm512_permutex2var_epi32(a, idxH, b);
2189}
2190
2191// integer version
2192template <typename T>
2193static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2194 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<8>)
2195{
2196 const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
2197 const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
2198 l = _mm512_permutex2var_epi64(a, idxL, b);
2199 h = _mm512_permutex2var_epi64(a, idxH, b);
2200}
2201
2202// integer version
2203template <typename T>
2204static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2205 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<16>)
2206{
2207 const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
2208 const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
2209 l = _mm512_permutex2var_epi64(a, idxL, b);
2210 h = _mm512_permutex2var_epi64(a, idxH, b);
2211}
2212
2213// integer version
2214template <typename T>
2215static SIMD_INLINE void unzip(const Vec<T, 64> a, const Vec<T, 64> b,
2216 Vec<T, 64> &l, Vec<T, 64> &h, Bytes<32>)
2217{
2218 l = unpack(a, b, Part<0>(), Bytes<32>());
2219 h = unpack(a, b, Part<1>(), Bytes<32>());
2220}
2221
2222// float version
2223static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
2224 Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<4>)
2225{
2226 const __m512i idxL =
2227 _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2228 const __m512i idxH =
2229 _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2230 l = _mm512_permutex2var_ps(a, idxL, b);
2231 h = _mm512_permutex2var_ps(a, idxH, b);
2232}
2233
2234// float version
2235static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
2236 Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<8>)
2237{
2238 const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
2239 const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
2240 l = _mm512_castpd_ps(
2241 _mm512_permutex2var_pd(_mm512_castps_pd(a), idxL, _mm512_castps_pd(b)));
2242 h = _mm512_castpd_ps(
2243 _mm512_permutex2var_pd(_mm512_castps_pd(a), idxH, _mm512_castps_pd(b)));
2244}
2245
2246// float version
2247static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
2248 Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<16>)
2249{
2250 const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
2251 const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
2252 l = _mm512_castpd_ps(
2253 _mm512_permutex2var_pd(_mm512_castps_pd(a), idxL, _mm512_castps_pd(b)));
2254 h = _mm512_castpd_ps(
2255 _mm512_permutex2var_pd(_mm512_castps_pd(a), idxH, _mm512_castps_pd(b)));
2256}
2257
2258// float version
2259static SIMD_INLINE void unzip(const Vec<Float, 64> a, const Vec<Float, 64> b,
2260 Vec<Float, 64> &l, Vec<Float, 64> &h, Bytes<32>)
2261{
2262 l = unpack(a, b, Part<0>(), Bytes<32>());
2263 h = unpack(a, b, Part<1>(), Bytes<32>());
2264}
2265
2266// double version
2267static SIMD_INLINE void unzip(const Vec<Double, 64> a, const Vec<Double, 64> b,
2268 Vec<Double, 64> &l, Vec<Double, 64> &h, Bytes<8>)
2269{
2270 const __m512i idxL = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
2271 const __m512i idxH = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
2272 l = _mm512_permutex2var_pd(a, idxL, b);
2273 h = _mm512_permutex2var_pd(a, idxH, b);
2274}
2275
2276// double version
2277static SIMD_INLINE void unzip(const Vec<Double, 64> a, const Vec<Double, 64> b,
2278 Vec<Double, 64> &l, Vec<Double, 64> &h, Bytes<16>)
2279{
2280 const __m512i idxL = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0);
2281 const __m512i idxH = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2);
2282 l = _mm512_permutex2var_pd(a, idxL, b);
2283 h = _mm512_permutex2var_pd(a, idxH, b);
2284}
2285
2286// ---------------------------------------------------------------------------
2287// packs v
2288// ---------------------------------------------------------------------------
2289
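// Illustration (assuming x_mm512_evenodd8x64_epi64 moves the even-numbered
// 64-bit chunks in front of the odd-numbered ones): each packs below performs
// a (saturating) narrowing conversion with a in the lower and b in the upper
// half of the result, e.g. for Short -> SignedByte:
//   result = sat(a0) ... sat(a31) sat(b0) ... sat(b31)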
2290// ========== signed -> signed ==========
2291
2292static SIMD_INLINE Vec<SignedByte, 64> packs(const Vec<Short, 64> &a,
2293 const Vec<Short, 64> &b,
2294 OutputType<SignedByte>)
2295{
2296 return x_mm512_evenodd8x64_epi64(x_mm512_packs_epi16(a, b));
2297}
2298
2299static SIMD_INLINE Vec<Short, 64> packs(const Vec<Int, 64> &a,
2300 const Vec<Int, 64> &b,
2301 OutputType<Short>)
2302{
2303 return x_mm512_evenodd8x64_epi64(x_mm512_packs_epi32(a, b));
2304}
2305
2306static SIMD_INLINE Vec<Short, 64> packs(const Vec<Float, 64> &a,
2307 const Vec<Float, 64> &b,
2308 OutputType<Short>)
2309{
2310 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2311 OutputType<Short>());
2312}
2313
2314static SIMD_INLINE Vec<Int, 64> packs(const Vec<Long, 64> &a,
2315 const Vec<Long, 64> &b, OutputType<Int>)
2316{
2317 return _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi64_epi32(a)),
2318 _mm512_cvtsepi64_epi32(b), 1);
2319}
2320
2321static SIMD_INLINE Vec<Float, 64> packs(const Vec<Long, 64> &a,
2322 const Vec<Long, 64> &b,
2323 OutputType<Float>)
2324{
2325#ifdef __AVX512DQ__
2326 const __m256d low = _mm256_castps_pd(_mm512_cvtepi64_ps(a));
2327 const __m256d high = _mm256_castps_pd(_mm512_cvtepi64_ps(b));
2328#else
2329 const __m256d low =
2330 _mm256_castps_pd(_mm512_cvtpd_ps(cvts(a, OutputType<Double>())));
2331 const __m256d high =
2332 _mm256_castps_pd(_mm512_cvtpd_ps(cvts(b, OutputType<Double>())));
2333#endif
2334 return _mm512_castpd_ps(
2335 _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1));
2336}
2337
2338static SIMD_INLINE Vec<Float, 64> packs(const Vec<Double, 64> &a,
2339 const Vec<Double, 64> &b,
2340 OutputType<Float>)
2341{
2342 const __m256d low = _mm256_castps_pd(_mm512_cvtpd_ps(a));
2343 const __m256d high = _mm256_castps_pd(_mm512_cvtpd_ps(b));
2344 return _mm512_castpd_ps(
2345 _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1));
2346}
2347
2348static SIMD_INLINE Vec<Int, 64> packs(const Vec<Double, 64> &a,
2349 const Vec<Double, 64> &b, OutputType<Int>)
2350{
2351 const __m512d clip = _mm512_set1_pd(std::numeric_limits<Int>::max());
2352 const __m256i low = _mm512_cvtpd_epi32(_mm512_min_pd(clip, a));
2353 const __m256i high = _mm512_cvtpd_epi32(_mm512_min_pd(clip, b));
2354 return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
2355}
2356
2357// ========== unsigned -> unsigned ==========
2358
2359// non-avx512bw workaround
2360static SIMD_INLINE Vec<Byte, 64> packs(const Vec<Word, 64> &a,
2361 const Vec<Word, 64> &b, OutputType<Byte>)
2362{
2363 const auto aSaturated = min(a, Vec<Word, 64>(_mm512_set1_epi16(0xff)));
2364 const auto bSaturated = min(b, Vec<Word, 64>(_mm512_set1_epi16(0xff)));
2365 return x_mm512_evenodd8x64_epi64(
2366 x_mm512_packus_epi16(aSaturated, bSaturated));
2367}
2368
2369// ========== signed -> unsigned ==========
2370
2371// non-avx512bw workaround
2372static SIMD_INLINE Vec<Byte, 64> packs(const Vec<Short, 64> &a,
2373 const Vec<Short, 64> &b,
2374 OutputType<Byte>)
2375{
2376 return x_mm512_evenodd8x64_epi64(x_mm512_packus_epi16(a, b));
2377}
2378
2379// non-avx512bw workaround
2380static SIMD_INLINE Vec<Word, 64> packs(const Vec<Int, 64> &a,
2381 const Vec<Int, 64> &b, OutputType<Word>)
2382{
2383 return x_mm512_evenodd8x64_epi64(x_mm512_packus_epi32(a, b));
2384}
2385
2386static SIMD_INLINE Vec<Word, 64> packs(const Vec<Float, 64> &a,
2387 const Vec<Float, 64> &b,
2388 OutputType<Word>)
2389{
2390 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2391 OutputType<Word>());
2392}
2393
2394// ========== unsigned -> signed ==========
2395
2396// non-avx512bw workaround
2397static SIMD_INLINE Vec<SignedByte, 64> packs(const Vec<Word, 64> &a,
2398 const Vec<Word, 64> &b,
2399 OutputType<SignedByte>)
2400{
2401 return x_mm512_evenodd8x64_epi64(
2402 x_mm512_packs_epi16(min(a, Vec<Word, 64>(_mm512_set1_epi16(0x7f))),
2403 min(b, Vec<Word, 64>(_mm512_set1_epi16(0x7f)))));
2404}
2405
2406// ---------------------------------------------------------------------------
2407// generalized extend: no stage v
2408// ---------------------------------------------------------------------------
2409
2410// combinations:
2411// - signed -> extended signed (sign extension)
2412// - unsigned -> extended unsigned (zero extension)
2413// - unsigned -> extended signed (zero extension)
2414// - signed -> extended unsigned (saturation and zero extension)
2415
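// Example (illustration only): the signed -> unsigned cases below clamp
// negative inputs to zero, so for SignedByte -> Byte an element of -5 becomes
// 0 while 100 stays 100; the unsigned -> signed cases clamp at the signed
// maximum instead (e.g. a Byte of 200 becomes a SignedByte of 127).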
2416// same types
2417template <typename T>
2418static SIMD_INLINE void extend(const Vec<T, 64> &vIn, Vec<T, 64> vOut[1])
2419{
2420 vOut[0] = vIn;
2421}
2422
2423// same size, different types
2424
2425static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2426 Vec<Byte, 64> vOut[1])
2427{
2428 vOut[0] = max(vIn, Vec<SignedByte, 64>(_mm512_setzero_si512()));
2429}
2430
2431static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn,
2432 Vec<SignedByte, 64> vOut[1])
2433{
2434 vOut[0] = min(vIn, Vec<Byte, 64>(_mm512_set1_epi8(0x7f)));
2435}
2436
2437static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Word, 64> vOut[1])
2438{
2439 vOut[0] = max(vIn, Vec<Short, 64>(_mm512_setzero_si512()));
2440}
2441
2442static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Short, 64> vOut[1])
2443{
2444 vOut[0] = min(vIn, Vec<Word, 64>(_mm512_set1_epi16(0x7fff)));
2445}
2446
2447// ---------------------------------------------------------------------------
2448// generalized extend: single stage v
2449// ---------------------------------------------------------------------------
2450
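// Each widening stage doubles the element size and halves the element count,
// so a single-stage extend fills two output vectors: the lower half of vIn is
// widened into vOut[0] and the upper half into vOut[1] (see the lo()/hi()
// calls below). The two- and three-stage versions further below fill four and
// eight output vectors in the same manner.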
2451// signed -> signed
2452
2453static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2454 Vec<Short, 64> vOut[2])
2455{
2456#ifdef __AVX512BW__
2457 vOut[0] = _mm512_cvtepi8_epi16(vIn.lo());
2458 vOut[1] = _mm512_cvtepi8_epi16(vIn.hi());
2459#else
2460 {
2461 const __m256i lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn.lo()));
2462 const __m256i hi =
2463 _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn.lo(), 1));
2464 vOut[0] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
2465 }
2466 {
2467 const __m256i lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn.hi()));
2468 const __m256i hi =
2469 _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn.hi(), 1));
2470 vOut[1] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
2471 }
2472#endif
2473}
2474
2475static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Int, 64> vOut[2])
2476{
2477 vOut[0] = _mm512_cvtepi16_epi32(vIn.lo());
2478 vOut[1] = _mm512_cvtepi16_epi32(vIn.hi());
2479}
2480
2481static SIMD_INLINE void extend(const Vec<Short, 64> &vIn,
2482 Vec<Float, 64> vOut[2])
2483{
2484 vOut[0] = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vIn.lo()));
2485 vOut[1] = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vIn.hi()));
2486}
2487
2488static SIMD_INLINE void extend(const Vec<Int, 64> &vIn, Vec<Long, 64> vecOut[2])
2489{
2490 vecOut[0] = _mm512_cvtepi32_epi64(vIn.lo());
2491 vecOut[1] = _mm512_cvtepi32_epi64(vIn.hi());
2492}
2493
2494static SIMD_INLINE void extend(const Vec<Int, 64> &vIn,
2495 Vec<Double, 64> vecOut[2])
2496{
2497 vecOut[0] = _mm512_cvtepi32_pd(vIn.lo());
2498 vecOut[1] = _mm512_cvtepi32_pd(vIn.hi());
2499}
2500
2501static SIMD_INLINE void extend(const Vec<Float, 64> &vIn,
2502 Vec<Long, 64> vecOut[2])
2503{
2504 const Vec<Float, 64> clipped =
2505 _mm512_min_ps(_mm512_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64), vIn);
2506#ifdef __AVX512DQ__
2507 vecOut[0] = _mm512_cvtps_epi64(clipped.lo());
2508 vecOut[1] = _mm512_cvtps_epi64(clipped.hi());
2509#else
2510 vecOut[0] = cvts(_mm512_cvtps_pd(clipped.lo()), OutputType<Long>());
2511 vecOut[1] = cvts(_mm512_cvtps_pd(clipped.hi()), OutputType<Long>());
2512#endif
2513}
2514
2515static SIMD_INLINE void extend(const Vec<Float, 64> &vIn,
2516 Vec<Double, 64> vecOut[2])
2517{
2518 vecOut[0] = _mm512_cvtps_pd(vIn.lo());
2519 vecOut[1] = _mm512_cvtps_pd(vIn.hi());
2520}
2521
2522// unsigned -> unsigned
2523
2524static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Word, 64> vOut[2])
2525{
2526 // there's no _mm512_cvtepu8_epu16()
2527 vOut[0] = unpack(vIn, setzero(OutputType<Byte>(), Integer<64>()), Part<0>(),
2528 Bytes<1>());
2529 vOut[1] = unpack(vIn, setzero(OutputType<Byte>(), Integer<64>()), Part<1>(),
2530 Bytes<1>());
2531}
2532
2533// unsigned -> signed
2534
2535static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Short, 64> vOut[2])
2536{
2537#ifdef __AVX512BW__
2538 vOut[0] = _mm512_cvtepu8_epi16(vIn.lo());
2539 vOut[1] = _mm512_cvtepu8_epi16(vIn.hi());
2540#else
2541 {
2542 const __m256i lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn.lo()));
2543 const __m256i hi =
2544 _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn.lo(), 1));
2545 vOut[0] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
2546 }
2547 {
2548 const __m256i lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn.hi()));
2549 const __m256i hi =
2550 _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn.hi(), 1));
2551 vOut[1] = _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
2552 }
2553#endif
2554}
2555
2556static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Int, 64> vOut[2])
2557{
2558 vOut[0] = _mm512_cvtepu16_epi32(vIn.lo());
2559 vOut[1] = _mm512_cvtepu16_epi32(vIn.hi());
2560}
2561
2562static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Float, 64> vOut[2])
2563{
2564 vOut[0] = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(vIn.lo()));
2565 vOut[1] = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(vIn.hi()));
2566}
2567
2568// signed -> unsigned
2569
2570static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2571 Vec<Word, 64> vOut[2])
2572{
2573 const Vec<SignedByte, 64> saturated =
2574 max(vIn, Vec<SignedByte, 64>(_mm512_setzero_si512()));
2575 vOut[0] = unpack(saturated, setzero(OutputType<SignedByte>(), Integer<64>()),
2576 Part<0>(), Bytes<1>());
2577 vOut[1] = unpack(saturated, setzero(OutputType<SignedByte>(), Integer<64>()),
2578 Part<1>(), Bytes<1>());
2579}
2580
2581// ---------------------------------------------------------------------------
2582// generalized extend: two stages v
2583// ---------------------------------------------------------------------------
2584
2585// signed -> signed
2586
2587static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2588 Vec<Int, 64> vOut[4])
2589{
2590 vOut[0] = _mm512_cvtepi8_epi32(_mm256_castsi256_si128(vIn.lo()));
2591 vOut[1] = _mm512_cvtepi8_epi32(_mm256_extractf128_si256(vIn.lo(), 1));
2592 vOut[2] = _mm512_cvtepi8_epi32(_mm256_castsi256_si128(vIn.hi()));
2593 vOut[3] = _mm512_cvtepi8_epi32(_mm256_extractf128_si256(vIn.hi(), 1));
2594}
2595
2596static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2597 Vec<Float, 64> vOut[4])
2598{
2599 Vec<Int, 64> vTmp[4];
2600 extend(vIn, vTmp);
2601 for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
2602}
2603
2604static SIMD_INLINE void extend(const Vec<Short, 64> &vIn, Vec<Long, 64> vOut[4])
2605{
2606 vOut[0] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 0));
2607 vOut[1] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 1));
2608 vOut[2] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 2));
2609 vOut[3] = _mm512_cvtepi16_epi64(_mm512_extracti32x4_epi32(vIn, 3));
2610}
2611
2612static SIMD_INLINE void extend(const Vec<Short, 64> &vIn,
2613 Vec<Double, 64> vOut[4])
2614{
2615 vOut[0] = _mm512_cvtepi32_pd(
2616 _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 0)));
2617 vOut[1] = _mm512_cvtepi32_pd(
2618 _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 1)));
2619 vOut[2] = _mm512_cvtepi32_pd(
2620 _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 2)));
2621 vOut[3] = _mm512_cvtepi32_pd(
2622 _mm256_cvtepi16_epi32(_mm512_extracti32x4_epi32(vIn, 3)));
2623}
2624
2625// unsigned -> signed
2626
2627static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Int, 64> vOut[4])
2628{
2629 vOut[0] = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(vIn.lo()));
2630 vOut[1] = _mm512_cvtepu8_epi32(_mm256_extractf128_si256(vIn.lo(), 1));
2631 vOut[2] = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(vIn.hi()));
2632 vOut[3] = _mm512_cvtepu8_epi32(_mm256_extractf128_si256(vIn.hi(), 1));
2633}
2634
2635static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Float, 64> vOut[4])
2636{
2637 Vec<Int, 64> vTmp[4];
2638 extend(vIn, vTmp);
2639 for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
2640}
2641
2642static SIMD_INLINE void extend(const Vec<Word, 64> &vIn, Vec<Long, 64> vOut[4])
2643{
2644 vOut[0] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 0));
2645 vOut[1] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 1));
2646 vOut[2] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 2));
2647 vOut[3] = _mm512_cvtepu16_epi64(_mm512_extracti32x4_epi32(vIn, 3));
2648}
2649
2650static SIMD_INLINE void extend(const Vec<Word, 64> &vIn,
2651 Vec<Double, 64> vOut[4])
2652{
2653 vOut[0] = _mm512_cvtepi32_pd(
2654 _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 0)));
2655 vOut[1] = _mm512_cvtepi32_pd(
2656 _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 1)));
2657 vOut[2] = _mm512_cvtepi32_pd(
2658 _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 2)));
2659 vOut[3] = _mm512_cvtepi32_pd(
2660 _mm256_cvtepu16_epi32(_mm512_extracti32x4_epi32(vIn, 3)));
2661}
2662
2663// ---------------------------------------------------------------------------
2664// generalized extend: three stages
2665// ---------------------------------------------------------------------------
2666
2667// signed -> signed
2668
2669static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2670 Vec<Long, 64> vOut[8])
2671{
2672 vOut[0] = _mm512_cvtepi8_epi64(_mm512_castsi512_si128(vIn));
2673 vOut[1] =
2674 _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_castsi512_si128(vIn), 8));
2675 vOut[2] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 1));
2676 vOut[3] =
2677 _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 1), 8));
2678 vOut[4] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 2));
2679 vOut[5] =
2680 _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 2), 8));
2681 vOut[6] = _mm512_cvtepi8_epi64(_mm512_extracti32x4_epi32(vIn, 3));
2682 vOut[7] =
2683 _mm512_cvtepi8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 3), 8));
2684}
2685
2686static SIMD_INLINE void extend(const Vec<SignedByte, 64> &vIn,
2687 Vec<Double, 64> vOut[8])
2688{
2689 const __m128i vIn128[4] = {
2690 _mm512_extracti32x4_epi32(vIn, 0),
2691 _mm512_extracti32x4_epi32(vIn, 1),
2692 _mm512_extracti32x4_epi32(vIn, 2),
2693 _mm512_extracti32x4_epi32(vIn, 3),
2694 };
2695
2696 for (size_t i = 0; i < 4; i++) {
2697 vOut[i * 2 + 0] = _mm512_cvtepi32_pd(_mm256_cvtepi8_epi32(vIn128[i]));
2698 vOut[i * 2 + 1] =
2699 _mm512_cvtepi32_pd(_mm256_cvtepi8_epi32(_mm_srli_si128(vIn128[i], 8)));
2700 }
2701}
2702
2703// unsigned -> signed
2704
2705static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn, Vec<Long, 64> vOut[8])
2706{
2707 vOut[0] = _mm512_cvtepu8_epi64(_mm512_castsi512_si128(vIn));
2708 vOut[1] =
2709 _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_castsi512_si128(vIn), 8));
2710 vOut[2] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 1));
2711 vOut[3] =
2712 _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 1), 8));
2713 vOut[4] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 2));
2714 vOut[5] =
2715 _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 2), 8));
2716 vOut[6] = _mm512_cvtepu8_epi64(_mm512_extracti32x4_epi32(vIn, 3));
2717 vOut[7] =
2718 _mm512_cvtepu8_epi64(_mm_srli_si128(_mm512_extracti32x4_epi32(vIn, 3), 8));
2719}
2720
2721static SIMD_INLINE void extend(const Vec<Byte, 64> &vIn,
2722 Vec<Double, 64> vOut[8])
2723{
2724 const __m128i vIn128[4] = {
2725 _mm512_extracti32x4_epi32(vIn, 0),
2726 _mm512_extracti32x4_epi32(vIn, 1),
2727 _mm512_extracti32x4_epi32(vIn, 2),
2728 _mm512_extracti32x4_epi32(vIn, 3),
2729 };
2730
2731 for (size_t i = 0; i < 4; i++) {
2732 vOut[i * 2 + 0] = _mm512_cvtepi32_pd(_mm256_cvtepu8_epi32(vIn128[i]));
2733 vOut[i * 2 + 1] =
2734 _mm512_cvtepi32_pd(_mm256_cvtepu8_epi32(_mm_srli_si128(vIn128[i], 8)));
2735 }
2736}
2737
2738// ---------------------------------------------------------------------------
2739// generalized extend: special case int <-> float, long <-> double
2740// ---------------------------------------------------------------------------
2741
2742template <typename Tout, typename Tin,
2743 SIMD_ENABLE_IF(sizeof(Tin) == sizeof(Tout)),
2744 SIMD_ENABLE_IF(std::is_floating_point<Tin>::value !=
2745 std::is_floating_point<Tout>::value)>
2746static SIMD_INLINE void extend(const Vec<Tin, 64> &vIn, Vec<Tout, 64> vOut[1])
2747{
2748 vOut[0] = cvts(vIn, OutputType<Tout>());
2749}
2750
2751// ---------------------------------------------------------------------------
2752// srai v
2753// ---------------------------------------------------------------------------
2754
2755#ifdef __AVX512BW__
2756// 16. Oct 22 (Jonas Keller): added missing Byte and SignedByte versions
2757
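// There is no 8-bit arithmetic shift instruction; the Byte/SignedByte versions
// emulate it with 16-bit shifts: "odd" is correct in the high byte of every
// 16-bit pair, "even" (input pre-shifted left by 8) in the low byte, and the
// 0x55... byte mask blends the two results together.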
2758template <size_t COUNT>
2759static SIMD_INLINE Vec<Byte, 64> srai(const Vec<Byte, 64> &a)
2760{
2761 const __m512i odd = _mm512_srai_epi16(a, vec::min(COUNT, 7ul));
2762 const __m512i even =
2763 _mm512_srai_epi16(_mm512_slli_epi16(a, 8), vec::min(COUNT, 7ul) + 8);
2764 const __mmask64 mask = __mmask64(0x5555555555555555);
2765 return _mm512_mask_blend_epi8(mask, odd, even);
2766}
2767
2768template <size_t COUNT>
2769static SIMD_INLINE Vec<SignedByte, 64> srai(const Vec<SignedByte, 64> &a)
2770{
2771 const __m512i odd = _mm512_srai_epi16(a, vec::min(COUNT, 7ul));
2772 const __m512i even =
2773 _mm512_srai_epi16(_mm512_slli_epi16(a, 8), vec::min(COUNT, 7ul) + 8);
2774 const __mmask64 mask = __mmask64(0x5555555555555555);
2775 return _mm512_mask_blend_epi8(mask, odd, even);
2776}
2777
2778template <size_t COUNT>
2779static SIMD_INLINE Vec<Word, 64> srai(const Vec<Word, 64> &a)
2780{
2781 return _mm512_srai_epi16(a, vec::min(COUNT, 15ul));
2782}
2783
2784template <size_t COUNT>
2785static SIMD_INLINE Vec<Short, 64> srai(const Vec<Short, 64> &a)
2786{
2787 return _mm512_srai_epi16(a, vec::min(COUNT, 15ul));
2788}
2789
2790#else
2791
2792// non-avx512bw workaround
2793template <size_t COUNT, typename T>
2794static SIMD_INLINE Vec<T, 64> srai(const Vec<T, 64> &a)
2795{
2796 return Vec<T, 64>(srai<COUNT>(a.lo()), srai<COUNT>(a.hi()));
2797}
2798
2799#endif
2800
2801template <size_t COUNT>
2802static SIMD_INLINE Vec<Int, 64> srai(const Vec<Int, 64> &a)
2803{
2804 return _mm512_srai_epi32(a, vec::min(COUNT, 31ul));
2805}
2806
2807template <size_t COUNT>
2808static SIMD_INLINE Vec<Long, 64> srai(const Vec<Long, 64> &a)
2809{
2810 return _mm512_srai_epi64(a, vec::min(COUNT, 63ul));
2811}
2812
2813// ---------------------------------------------------------------------------
2814// srli v
2815// ---------------------------------------------------------------------------
2816
2817template <size_t COUNT>
2818static SIMD_INLINE Vec<Byte, 64> srli(const Vec<Byte, 64> &a)
2819{
2820 SIMD_IF_CONSTEXPR (COUNT < 8) {
2821 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2822 // License: not specified
2823 return _mm512_and_si512(_mm512_set1_epi8((int8_t) (0xff >> COUNT)),
2824 _mm512_srli_epi32(a, COUNT));
2825 } else {
2826 return _mm512_setzero_si512();
2827 }
2828}
2829
2830template <size_t COUNT>
2831static SIMD_INLINE Vec<SignedByte, 64> srli(const Vec<SignedByte, 64> &a)
2832{
2833 SIMD_IF_CONSTEXPR (COUNT < 8) {
2834 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2835 // License: not specified
2836 return _mm512_and_si512(_mm512_set1_epi8((int8_t) (0xff >> COUNT)),
2837 _mm512_srli_epi32(a, COUNT));
2838 } else {
2839 return _mm512_setzero_si512();
2840 }
2841}
2842
2843template <size_t COUNT>
2844static SIMD_INLINE Vec<Word, 64> srli(const Vec<Word, 64> &a)
2845{
2846 SIMD_IF_CONSTEXPR (COUNT < 32) {
2847#ifdef __AVX512BW__
2848 return _mm512_srli_epi16(a, COUNT);
2849#else
2850 return _mm512_and_si512(_mm512_set1_epi16((int16_t) (0xffff >> COUNT)),
2851 _mm512_srli_epi32(a, COUNT));
2852#endif
2853 } else {
2854 return _mm512_setzero_si512();
2855 }
2856}
2857
2858template <size_t COUNT>
2859static SIMD_INLINE Vec<Short, 64> srli(const Vec<Short, 64> &a)
2860{
2861 SIMD_IF_CONSTEXPR (COUNT < 32) {
2862#ifdef __AVX512BW__
2863 return _mm512_srli_epi16(a, COUNT);
2864#else
2865 return _mm512_and_si512(_mm512_set1_epi16((int16_t) (0xffff >> COUNT)),
2866 _mm512_srli_epi32(a, COUNT));
2867#endif
2868 } else {
2869 return _mm512_setzero_si512();
2870 }
2871}
2872
2873template <size_t COUNT>
2874static SIMD_INLINE Vec<Int, 64> srli(const Vec<Int, 64> &a)
2875{
2876 SIMD_IF_CONSTEXPR (COUNT < 32) {
2877 return _mm512_srli_epi32(a, COUNT);
2878 } else {
2879 return _mm512_setzero_si512();
2880 }
2881}
2882
2883template <size_t COUNT>
2884static SIMD_INLINE Vec<Long, 64> srli(const Vec<Long, 64> &a)
2885{
2886 SIMD_IF_CONSTEXPR (COUNT < 64) {
2887 return _mm512_srli_epi64(a, COUNT);
2888 } else {
2889 return _mm512_setzero_si512();
2890 }
2891}
2892
2893// ---------------------------------------------------------------------------
2894// slli v
2895// ---------------------------------------------------------------------------
2896
2897template <size_t COUNT>
2898static SIMD_INLINE Vec<Byte, 64> slli(const Vec<Byte, 64> &a)
2899{
2900 SIMD_IF_CONSTEXPR (COUNT < 8) {
2901 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2902 // License: not specified
2903 return _mm512_and_si512(
2904 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2905 _mm512_slli_epi32(a, COUNT));
2906 } else {
2907 return _mm512_setzero_si512();
2908 }
2909}
2910
2911template <size_t COUNT>
2912static SIMD_INLINE Vec<SignedByte, 64> slli(const Vec<SignedByte, 64> &a)
2913{
2914 SIMD_IF_CONSTEXPR (COUNT < 8) {
2915 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2916 // License: not specified
2917 return _mm512_and_si512(
2918 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2919 _mm512_slli_epi32(a, COUNT));
2920 } else {
2921 return _mm512_setzero_si512();
2922 }
2923}
2924
2925template <size_t COUNT>
2926static SIMD_INLINE Vec<Word, 64> slli(const Vec<Word, 64> &a)
2927{
2928 SIMD_IF_CONSTEXPR (COUNT < 16) {
2929#ifdef __AVX512BW__
2930 return _mm512_slli_epi16(a, COUNT);
2931#else
2932 return _mm512_and_si512(
2933 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << COUNT))),
2934 _mm512_slli_epi32(a, COUNT));
2935#endif
2936 } else {
2937 return _mm512_setzero_si512();
2938 }
2939}
2940
2941template <size_t COUNT>
2942static SIMD_INLINE Vec<Short, 64> slli(const Vec<Short, 64> &a)
2943{
2944 SIMD_IF_CONSTEXPR (COUNT < 16) {
2945#ifdef __AVX512BW__
2946 return _mm512_slli_epi16(a, COUNT);
2947#else
2948 return _mm512_and_si512(
2949 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << COUNT))),
2950 _mm512_slli_epi32(a, COUNT));
2951#endif
2952 } else {
2953 return _mm512_setzero_si512();
2954 }
2955}
2956
2957template <size_t COUNT>
2958static SIMD_INLINE Vec<Int, 64> slli(const Vec<Int, 64> &a)
2959{
2960 SIMD_IF_CONSTEXPR (COUNT < 32) {
2961 return _mm512_slli_epi32(a, COUNT);
2962 } else {
2963 return _mm512_setzero_si512();
2964 }
2965}
2966
2967template <size_t COUNT>
2968static SIMD_INLINE Vec<Long, 64> slli(const Vec<Long, 64> &a)
2969{
2970 SIMD_IF_CONSTEXPR (COUNT < 64) {
2971 return _mm512_slli_epi64(a, COUNT);
2972 } else {
2973 return _mm512_setzero_si512();
2974 }
2975}
2976
2977// 19. Dec 22 (Jonas Keller): added sra, srl and sll functions
2978
2979// ---------------------------------------------------------------------------
2980// sra
2981// ---------------------------------------------------------------------------
2982
2983#ifdef __AVX512BW__
2984
2985static SIMD_INLINE Vec<Byte, 64> sra(const Vec<Byte, 64> &a,
2986 const uint8_t count)
2987{
2988 if (count >= 8) {
2989 // result should be all ones if a is negative, all zeros otherwise
2990 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, _mm512_setzero_si512()));
2991 }
2992 __m512i odd = _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
2993 __m512i even =
2994 _mm512_sra_epi16(_mm512_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
2995 __mmask64 mask = __mmask64(0x5555555555555555);
2996 return _mm512_mask_blend_epi8(mask, odd, even);
2997}
2998
2999static SIMD_INLINE Vec<SignedByte, 64> sra(const Vec<SignedByte, 64> &a,
3000 const uint8_t count)
3001{
3002 if (count >= 8) {
3003 // result should be all ones if a is negative, all zeros otherwise
3004 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, _mm512_setzero_si512()));
3005 }
3006 __m512i odd = _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3007 __m512i even =
3008 _mm512_sra_epi16(_mm512_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3009 __mmask64 mask = __mmask64(0x5555555555555555);
3010 return _mm512_mask_blend_epi8(mask, odd, even);
3011}
3012
3013static SIMD_INLINE Vec<Word, 64> sra(const Vec<Word, 64> &a,
3014 const uint8_t count)
3015{
3016 return _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3017}
3018
3019static SIMD_INLINE Vec<Short, 64> sra(const Vec<Short, 64> &a,
3020 const uint8_t count)
3021{
3022 return _mm512_sra_epi16(a, _mm_cvtsi32_si128(count));
3023}
3024
3025#else
3026
3027// non-avx512bw workaround
3028template <typename T>
3029static SIMD_INLINE Vec<T, 64> sra(const Vec<T, 64> &a, const uint8_t count)
3030{
3031 return Vec<T, 64>(sra(a.lo(), count), sra(a.hi(), count));
3032}
3033
3034#endif
3035
3036static SIMD_INLINE Vec<Int, 64> sra(const Vec<Int, 64> &a, const uint8_t count)
3037{
3038 return _mm512_sra_epi32(a, _mm_cvtsi32_si128(count));
3039}
3040
3041static SIMD_INLINE Vec<Long, 64> sra(const Vec<Long, 64> &a,
3042 const uint8_t count)
3043{
3044 return _mm512_sra_epi64(a, _mm_cvtsi32_si128(count));
3045}
3046
3047// ---------------------------------------------------------------------------
3048// srl
3049// ---------------------------------------------------------------------------
3050
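// The Byte/SignedByte versions (and, without AVX512BW, the 16-bit versions)
// shift in 32-bit chunks and mask off the bits that were shifted in across
// element boundaries.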
3051static SIMD_INLINE Vec<Byte, 64> srl(const Vec<Byte, 64> &a,
3052 const uint8_t count)
3053{
3054 return _mm512_and_si512(_mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3055 _mm512_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3056}
3057
3058static SIMD_INLINE Vec<SignedByte, 64> srl(const Vec<SignedByte, 64> &a,
3059 const uint8_t count)
3060{
3061 return _mm512_and_si512(_mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3062 _mm512_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3063}
3064
3065static SIMD_INLINE Vec<Word, 64> srl(const Vec<Word, 64> &a,
3066 const uint8_t count)
3067{
3068#ifdef __AVX512BW__
3069 return _mm512_srl_epi16(a, _mm_cvtsi32_si128(count));
3070#else
3071 return _mm512_and_si512(
3072 _mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3073 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff >> count)));
3074#endif
3075}
3076
3077static SIMD_INLINE Vec<Short, 64> srl(const Vec<Short, 64> &a,
3078 const uint8_t count)
3079{
3080#ifdef __AVX512BW__
3081 return _mm512_srl_epi16(a, _mm_cvtsi32_si128(count));
3082#else
3083 return _mm512_and_si512(
3084 _mm512_srl_epi32(a, _mm_cvtsi32_si128(count)),
3085 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff >> count)));
3086#endif
3087}
3088
3089static SIMD_INLINE Vec<Int, 64> srl(const Vec<Int, 64> &a, const uint8_t count)
3090{
3091 return _mm512_srl_epi32(a, _mm_cvtsi32_si128(count));
3092}
3093
3094static SIMD_INLINE Vec<Long, 64> srl(const Vec<Long, 64> &a,
3095 const uint8_t count)
3096{
3097 return _mm512_srl_epi64(a, _mm_cvtsi32_si128(count));
3098}
3099
3100// ---------------------------------------------------------------------------
3101// sll
3102// ---------------------------------------------------------------------------
3103
3104static SIMD_INLINE Vec<Byte, 64> sll(const Vec<Byte, 64> &a,
3105 const uint8_t count)
3106{
3107 return _mm512_and_si512(
3108 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3109 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3110}
3111
3112static SIMD_INLINE Vec<SignedByte, 64> sll(const Vec<SignedByte, 64> &a,
3113 const uint8_t count)
3114{
3115 return _mm512_and_si512(
3116 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3117 _mm512_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3118}
3119
3120static SIMD_INLINE Vec<Word, 64> sll(const Vec<Word, 64> &a,
3121 const uint8_t count)
3122{
3123#ifdef __AVX512BW__
3124 return _mm512_sll_epi16(a, _mm_cvtsi32_si128(count));
3125#else
3126 return _mm512_and_si512(
3127 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3128 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << count))));
3129#endif
3130}
3131
3132static SIMD_INLINE Vec<Short, 64> sll(const Vec<Short, 64> &a,
3133 const uint8_t count)
3134{
3135#ifdef __AVX512BW__
3136 return _mm512_sll_epi16(a, _mm_cvtsi32_si128(count));
3137#else
3138 return _mm512_and_si512(
3139 _mm512_sll_epi32(a, _mm_cvtsi32_si128(count)),
3140 _mm512_set1_epi16((int16_t) (uint16_t) (0xffff & (0xffff << count))));
3141#endif
3142}
3143
3144static SIMD_INLINE Vec<Int, 64> sll(const Vec<Int, 64> &a, const uint8_t count)
3145{
3146 return _mm512_sll_epi32(a, _mm_cvtsi32_si128(count));
3147}
3148
3149static SIMD_INLINE Vec<Long, 64> sll(const Vec<Long, 64> &a,
3150 const uint8_t count)
3151{
3152 return _mm512_sll_epi64(a, _mm_cvtsi32_si128(count));
3153}
3154
3155// 05. Aug 22 (Jonas Keller):
3156// Improved implementation of hadd, hadds, hsub and hsubs,
3157// implementation does not use emulation via AVX anymore.
3158// Byte and SignedByte are now supported as well.
3159// The new implementation is faster for Int and Float, but
3160// slower for Word and Short for some reason.
3161
3162// ---------------------------------------------------------------------------
3163// hadd v
3164// ---------------------------------------------------------------------------
3165
3166template <typename T>
3167static SIMD_INLINE Vec<T, 64> hadd(const Vec<T, 64> &a, const Vec<T, 64> &b)
3168{
3169 Vec<T, 64> x, y;
3170 unzip<1>(a, b, x, y);
3171 return add(x, y);
3172}
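// Illustration (assuming unzip<1> dispatches to the element-wise unzip above):
//   hadd(a, b) = a0+a1, a2+a3, ..., b0+b1, b2+b3, ...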
3173
3174// ---------------------------------------------------------------------------
3175// hadds v
3176// ---------------------------------------------------------------------------
3177
3178template <typename T>
3179static SIMD_INLINE Vec<T, 64> hadds(const Vec<T, 64> &a, const Vec<T, 64> &b)
3180{
3181 Vec<T, 64> x, y;
3182 unzip<1>(a, b, x, y);
3183 return adds(x, y);
3184}
3185
3186// ---------------------------------------------------------------------------
3187// hsub v
3188// ---------------------------------------------------------------------------
3189
3190template <typename T>
3191static SIMD_INLINE Vec<T, 64> hsub(const Vec<T, 64> &a, const Vec<T, 64> &b)
3192{
3193 Vec<T, 64> x, y;
3194 unzip<1>(a, b, x, y);
3195 return sub(x, y);
3196}
3197
3198// ---------------------------------------------------------------------------
3199// hsubs v
3200// ---------------------------------------------------------------------------
3201
3202template <typename T>
3203static SIMD_INLINE Vec<T, 64> hsubs(const Vec<T, 64> &a, const Vec<T, 64> &b)
3204{
3205 Vec<T, 64> x, y;
3206 unzip<1>(a, b, x, y);
3207 return subs(x, y);
3208}
3209
3210// ---------------------------------------------------------------------------
3211// permute_64_16: permutation of 128-bit lanes, two sources v
3212// ---------------------------------------------------------------------------
3213
3214// template parameter:
3215// - ABi (0/1): take output lane i from a (0) or from b (1)
3216// - Ii (0..3): lane index within the selected source for output lane i
3217
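// Example (illustration only): permute_64_16<0,0, 0,2, 1,0, 1,2>(a, b) returns
// the 128-bit lanes a0 a2 b0 b2 (this instantiation is used by
// Swizzle_64_16<2> below).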
3218template <size_t AB0, size_t I0, size_t AB1, size_t I1, size_t AB2, size_t I2,
3219 size_t AB3, size_t I3, typename T>
3220static SIMD_INLINE Vec<T, 64> permute_64_16(const Vec<T, 64> &a,
3221 const Vec<T, 64> &b)
3222
3223{
3224 const __m512i mask = _mm512_set_epi64(
3225 (AB3 << 3) | (2 * I3 + 1), (AB3 << 3) | (2 * I3), (AB2 << 3) | (2 * I2 + 1),
3226 (AB2 << 3) | (2 * I2), (AB1 << 3) | (2 * I1 + 1), (AB1 << 3) | (2 * I1),
3227 (AB0 << 3) | (2 * I0 + 1), (AB0 << 3) | (2 * I0));
3228 // reinterpret as Int in case T is not an integer type
3229 const Vec<Int, 64> res = _mm512_permutex2var_epi64(
3230 reinterpret(a, OutputType<Int>()), mask, reinterpret(b, OutputType<Int>()));
3231 return reinterpret(res, OutputType<T>());
3232}
3233
3234// ---------------------------------------------------------------------------
3235// alignre v
3236// ---------------------------------------------------------------------------
3237
3238// Li, Hi: lanes
3239// n = COUNT * sizeof(T) [#bytes]
3240//
3241// input: H0 H1 H2 H3
3242// L0 L1 L2 L3 NB
3243// ==================
3244// n<16: L1 L2 L3 H0 L,H 1
3245// L0 L1 L2 L3 L,H 0
3246// ------------------
3247// n<32: L2 L3 H0 H1 L,H 2
3248// L1 L2 L3 H0 L,H 1
3249// ------------------
3250// n<48: L3 H0 H1 H2 L,H 3
3251// L2 L3 H0 H1 L,H 2
3252// ------------------
3253// n<64: H0 H1 H2 H3 L,H 4
3254// L3 H0 H1 H2 L,H 3
3255// ------------------
3256// n<80: H1 H2 H3 0 H,0 1
3257// H0 H1 H2 H3 H,0 0
3258// ------------------
3259// n<96: H2 H3 0 0 H,0 2
3260// H1 H2 H3 0 H,0 1
3261// ------------------
3262// n<112: H3 0 0 0 H,0 3
3263// H2 H3 0 0 H,0 2
3264// ------------------
3265// n<128: 0 0 0 0 H,0 4
3266// H3 0 0 0 H,0 3
3267
3268// align_64_16 v (helper for alignre)
3269
3270// 16-byte lanes: AB0 I0 AB1 I1 AB2 I2 AB3 I3
3271// NB=0: a0 a1 a2 a3 0 0 0 1 0 2 0 3
3272// NB=1: a1 a2 a3 b0 0 1 0 2 0 3 1 0
3273// NB=2: a2 a3 b0 b1 0 2 0 3 1 0 1 1
3274// NB=3: a3 b0 b1 b2 0 3 1 0 1 1 1 2
3275// NB=4: b0 b1 b2 b3 1 0 1 1 1 2 1 3
3276
3277template <size_t NB, typename T>
3278static SIMD_INLINE Vec<T, 64> align_64_16(const Vec<T, 64> &a,
3279 const Vec<T, 64> &b)
3280{
3281 SIMD_IF_CONSTEXPR (NB == 0) {
3282 return a;
3283 } else SIMD_IF_CONSTEXPR (NB == 4) {
3284 return b;
3285 } else {
3286 return permute_64_16<(NB > 3), (NB % 4), (NB > 2), (NB + 1) % 4, (NB > 1),
3287 (NB + 2) % 4, (NB > 0), (NB + 3) % 4>(a, b);
3288 }
3289}
3290
3291// COUNT: in elements
3292template <size_t COUNT, typename T>
3293static SIMD_INLINE Vec<T, 64> alignre(const Vec<T, 64> &h, const Vec<T, 64> &l)
3294{
3295 const auto byteShift = COUNT * sizeof(T);
3296 SIMD_IF_CONSTEXPR (byteShift < 128) {
3297 const auto laneShift = byteShift / 16;
3298 const Vec<T, 64> L = (byteShift < 64) ? l : h;
3299 const Vec<T, 64> H =
3300 (byteShift < 64) ? h : setzero(OutputType<T>(), Integer<64>());
3301 const Vec<T, 64> ll = align_64_16<laneShift % 4>(L, H);
3302 const Vec<T, 64> hh = align_64_16<laneShift % 4 + 1>(L, H);
3303 return reinterpret(Vec<Byte, 64>(x_mm512_alignr_epi8<byteShift % 16>(
3304 reinterpret(hh, OutputType<Byte>()),
3305 reinterpret(ll, OutputType<Byte>()))),
3306 OutputType<T>());
3307 } else {
3308 return setzero(OutputType<T>(), Integer<64>());
3309 }
3310}
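// Example (illustration only): for T = Int and COUNT = 4, byteShift is 16, so
// the result is the lane sequence l1 l2 l3 h0, i.e. the concatenation h:l
// shifted right by 16 bytes (cf. the n<32 rows of the table above).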
3311
3312// ---------------------------------------------------------------------------
3313// srle: element-wise right shift (via alignre) v
3314// ---------------------------------------------------------------------------
3315
3316// TODO: srle: solution with byte-wise shift intrinsics instead of align?
3317
3318// COUNT: in elements
3319template <size_t COUNT, typename T>
3320static SIMD_INLINE Vec<T, 64> srle(const Vec<T, 64> &a)
3321{
3322 SIMD_IF_CONSTEXPR (COUNT < Vec<T, 64>::elements) {
3323 return alignre<COUNT>(setzero(OutputType<T>(), Integer<64>()), a);
3324 } else {
3325 return setzero(OutputType<T>(), Integer<64>());
3326 }
3327}
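// Example (illustration only): for T = Int, srle<2>(a) = a2 a3 ... a15 0 0,
// i.e. the vector is shifted right by two elements and zero-filled; slle below
// shifts left analogously.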
3328
3329// ---------------------------------------------------------------------------
3330// slle: element-wise left shift (via alignre) v
3331// ---------------------------------------------------------------------------
3332
3333// TODO: slle: solution with byte-wise shift intrinsics instead of align?
3334
3335// COUNT: in elements
3336template <size_t COUNT, typename T>
3337static SIMD_INLINE Vec<T, 64> slle(const Vec<T, 64> &a)
3338{
3339 SIMD_IF_CONSTEXPR (COUNT < Vec<T, 64>::elements) {
3340 return alignre<Vec<T, 64>::elements - COUNT>(
3341 a, setzero(OutputType<T>(), Integer<64>()));
3342 } else {
3343 return setzero(OutputType<T>(), Integer<64>());
3344 }
3345}
3346
3347// ---------------------------------------------------------------------------
3348// swizzle v
3349// ---------------------------------------------------------------------------
3350
3351// ---------- swizzle aux functions -----------
3352
3353// ALIGNOFF is the offset in bytes (element offset times element size)
3354template <size_t ALIGNOFF>
3355static SIMD_INLINE __m512i align_shuffle_512(__m512i lo, __m512i hi,
3356 __m512i mask)
3357{
3358 static_assert(ALIGNOFF < 32, "");
3359 return x_mm512_shuffle_epi8(x_mm512_alignr_epi8<ALIGNOFF>(hi, lo), mask);
3360}
3361
3362// swizzle_64_16: swizzling of 128-bit lanes (for swizzle) v
3363
3364// each block (e.g. h2) is a 128-bit lane:
3365//
3366// example:
3367//
3368// ----v[0]---|----v[1]---
3369// n=2: l0 L0 h0 H0 l1 L1 h1 H1
3370// -- -- -- --
3371// -- -- -- --
3372// -> l0 h0 l1 h1 L0 H0 L1 H1
3373// -----------|-----------
3374//
3375//
3376// ----v[0]---|----v[1]---|----v[2]---
3377// n=3: l0 L0 h0 H0 l1 L1 h1 H1 l2 L2 h2 H2
3378// -- -- -- --
3379// -- -- -- --
3380// -- -- -- --
3381// -> l0 H0 h1 L2 L0 l1 H1 h2 h0 L1 l2 H2
3382// -----------|-----------|-----------
3383//
3384//
3385// ----v[0]---|----v[1]---|----v[2]---|----v[3]---
3386// n=4: l0 L0 h0 H0 l1 L1 h1 H1 l2 L2 h2 H2 l3 L3 h3 H3
3387// -- -- -- --
3388// -- -- -- --
3389// -- -- -- --
3390// -- -- -- --
3391// -> l0 l1 l2 l3 L0 L1 L2 L3 h0 h1 h2 h3 H0 H1 H2 H3
3392// -----------|-----------|-----------|-----------
3393//
3394//
3395// ----v[0]---|----v[1]---|----v[2]---|----v[3]---|----v[4]---
3396// n=5: l0 L0 h0 H0 l1 L1 h1 H1 l2 L2 h2 H2 l3 L3 h3 H3 l4 L4 h4 H4
3397// -- -- -- --
3398// -- -- -- --
3399// -- -- -- --
3400// -- -- -- --
3401// -- -- -- --
3402// -> l0 L1 h2 H3 L0 h1 H2 l4 h0 H1 l3 L4 H0 l2 L3 h4 l1 L2 h3 H4
3403// -----------|-----------|-----------|-----------|-----------
3404
3405// primary template
3406template <size_t N, typename T>
3407struct Swizzle_64_16;
3408
3409// N=2
3410// vIn: 0 1 2 3 | 4 5 6 7
3411// vOut: 0 2 4 6 | 1 3 5 7
3412template <typename T>
3413struct Swizzle_64_16<2, T>
3414{
3415 static SIMD_INLINE void _swizzle_64_16(const Vec<T, 64> vIn[2],
3416 Vec<T, 64> vOut[2])
3417 {
3418 vOut[0] = permute_64_16<0, 0, 0, 2, 1, 0, 1, 2>(vIn[0], vIn[1]);
3419 vOut[1] = permute_64_16<0, 1, 0, 3, 1, 1, 1, 3>(vIn[0], vIn[1]);
3420 }
3421};
3422
3423// N=3
3424// vIn: 0 1 2 3 | 4 5 6 7 | 8 9 10 11
3425// vTmp: 0 6 1 4 | 7 10 5 8 | 3 9 2 11
3426// vOut: 0 3 6 9 | 1 4 7 10 | 2 5 8 11
3427template <typename T>
3428struct Swizzle_64_16<3, T>
3429{
3430 static SIMD_INLINE void _swizzle_64_16(const Vec<T, 64> vIn[3],
3431 Vec<T, 64> vOut[3])
3432 {
3433 Vec<T, 64> vTmp[3];
3434 vTmp[0] = permute_64_16<0, 0, 1, 2, 0, 1, 1, 0>(vIn[0], vIn[1]);
3435 vTmp[1] = permute_64_16<0, 3, 1, 2, 0, 1, 1, 0>(vIn[1], vIn[2]);
3436 vTmp[2] = permute_64_16<0, 3, 1, 1, 0, 2, 1, 3>(vIn[0], vIn[2]);
3437
3438 vOut[0] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vTmp[0], vTmp[2]);
3439 vOut[1] = permute_64_16<0, 2, 0, 3, 1, 0, 1, 1>(vTmp[0], vTmp[1]);
3440 vOut[2] = permute_64_16<1, 2, 0, 2, 0, 3, 1, 3>(vTmp[1], vTmp[2]);
3441 }
3442};
3443
3444// N=4
3445// vIn: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15
3446// vTmp: 0 4 1 5 | 2 6 3 7 | 8 12 9 13 | 10 14 11 15
3447// vOut: 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
3448template <typename T>
3449struct Swizzle_64_16<4, T>
3450{
3451 static SIMD_INLINE void _swizzle_64_16(const Vec<T, 64> vIn[4],
3452 Vec<T, 64> vOut[4])
3453 {
3454 Vec<T, 64> vTmp[4];
3455 vTmp[0] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vIn[0], vIn[1]);
3456 vTmp[1] = permute_64_16<0, 2, 1, 2, 0, 3, 1, 3>(vIn[0], vIn[1]);
3457 vTmp[2] = permute_64_16<0, 0, 1, 0, 0, 1, 1, 1>(vIn[2], vIn[3]);
3458 vTmp[3] = permute_64_16<0, 2, 1, 2, 0, 3, 1, 3>(vIn[2], vIn[3]);
3459
3460 vOut[0] = permute_64_16<0, 0, 0, 1, 1, 0, 1, 1>(vTmp[0], vTmp[2]);
3461 vOut[1] = permute_64_16<0, 2, 0, 3, 1, 2, 1, 3>(vTmp[0], vTmp[2]);
3462 vOut[2] = permute_64_16<0, 0, 0, 1, 1, 0, 1, 1>(vTmp[1], vTmp[3]);
3463 vOut[3] = permute_64_16<0, 2, 0, 3, 1, 2, 1, 3>(vTmp[1], vTmp[3]);
3464 }
3465};
3466
3467// N=5
3468// vIn: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3469// vTmp: 5 10 6 11 | 1 16 3 18 | 8 13 9 14 | 7 17 4 19 | 0 15 2 12
3470// vOut: 0 5 10 15 | 1 6 11 16 | 2 7 12 17 | 3 8 13 18 | 4 9 14 19
3471template <typename T>
3472struct Swizzle_64_16<5, T>
3473{
3474 static SIMD_INLINE void _swizzle_64_16(const Vec<T, 64> vIn[5],
3475 Vec<T, 64> vOut[5])
3476 {
3477 Vec<T, 64> vTmp[5];
3478 vTmp[0] = permute_64_16<0, 1, 1, 2, 0, 2, 1, 3>(vIn[1], vIn[2]);
3479 vTmp[1] = permute_64_16<0, 1, 1, 0, 0, 3, 1, 2>(vIn[0], vIn[4]);
3480 vTmp[2] = permute_64_16<0, 0, 1, 1, 0, 1, 1, 2>(vIn[2], vIn[3]);
3481 vTmp[3] = permute_64_16<0, 3, 1, 1, 0, 0, 1, 3>(vIn[1], vIn[4]);
3482 vTmp[4] = permute_64_16<0, 0, 1, 3, 0, 2, 1, 0>(vIn[0], vIn[3]);
3483
3484 vOut[0] = permute_64_16<1, 0, 0, 0, 0, 1, 1, 1>(vTmp[0], vTmp[4]);
3485 vOut[1] = permute_64_16<1, 0, 0, 2, 0, 3, 1, 1>(vTmp[0], vTmp[1]);
3486 vOut[2] = permute_64_16<1, 2, 0, 0, 1, 3, 0, 1>(vTmp[3], vTmp[4]);
3487 vOut[3] = permute_64_16<0, 2, 1, 0, 1, 1, 0, 3>(vTmp[1], vTmp[2]);
3488 vOut[4] = permute_64_16<1, 2, 0, 2, 0, 3, 1, 3>(vTmp[2], vTmp[3]);
3489 }
3490};
3491
3492// swizzle lanes (for implementation of swizzle functions)
3493template <size_t N, typename T>
3494static SIMD_INLINE void swizzle_64_16(const Vec<T, 64> vIn[N],
3495 Vec<T, 64> vOut[N])
3496{
3497 Swizzle_64_16<N, T>::_swizzle_64_16(vIn, vOut);
3498}
3499
3500// ---------- swizzle (AoS to SoA) ----------
3501
3502// 01. Apr 23 (Jonas Keller): switched from using tag dispatching to using
3503// enable_if SFINAE, which allows more cases with the same implementation
3504// to be combined
3505
3506// -------------------- n = 1 --------------------
3507
3508// all types
3509template <typename T>
3510static SIMD_INLINE void swizzle(Vec<T, 64>[1], Integer<1>)
3511{
3512 // v remains unchanged
3513}
3514
3515// -------------------- n = 2 --------------------
3516
3517// 8 and 16 bit integer types
3518template <typename T,
3519 SIMD_ENABLE_IF((sizeof(T) <= 2 && std::is_integral<T>::value))>
3520static SIMD_INLINE void swizzle(Vec<T, 64> v[2], Integer<2>)
3521{
3522 Vec<T, 64> vs[2];
3523 swizzle_64_16<2>(v, vs);
3524 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<2, T>());
3525 const __m512i s[2] = {
3526 x_mm512_shuffle_epi8(vs[0], mask),
3527 x_mm512_shuffle_epi8(vs[1], mask),
3528 };
3529 v[0] = _mm512_unpacklo_epi64(s[0], s[1]);
3530 v[1] = _mm512_unpackhi_epi64(s[0], s[1]);
3531}
3532
3533// 32 bit types
3534template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3535static SIMD_INLINE void swizzle(Vec<T, 64> v[2], Integer<2>)
3536{
3537 const Vec<Float, 64> vFloat[2] = {
3538 reinterpret(v[0], OutputType<Float>()),
3539 reinterpret(v[1], OutputType<Float>()),
3540 };
3541 Vec<Float, 64> vs[2];
3542 swizzle_64_16<2>(vFloat, vs);
3543 const Vec<Float, 64> vOut[2] = {
3544 _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(2, 0, 2, 0)),
3545 _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 1, 3, 1)),
3546 };
3547 v[0] = reinterpret(vOut[0], OutputType<T>());
3548 v[1] = reinterpret(vOut[1], OutputType<T>());
3549}
3550
3551// 64 bit types
3552template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3553 typename = void>
3554static SIMD_INLINE void swizzle(Vec<T, 64> v[2], Integer<2>)
3555{
3556 const Vec<Double, 64> vDouble[2] = {
3557 reinterpret(v[0], OutputType<Double>()),
3558 reinterpret(v[1], OutputType<Double>()),
3559 };
3560 Vec<Double, 64> vs[2];
3561 swizzle_64_16<2>(vDouble, vs);
3562 const Vec<Double, 64> vOut[2] = {
3563 _mm512_shuffle_pd(vs[0], vs[1], 0x00),
3564 _mm512_shuffle_pd(vs[0], vs[1], 0xFF),
3565 };
3566 v[0] = reinterpret(vOut[0], OutputType<T>());
3567 v[1] = reinterpret(vOut[1], OutputType<T>());
3568}
3569
3570// -------------------- n = 3 --------------------
3571
3572// 8 and 16 bit integer types
3573template <typename T,
3574 SIMD_ENABLE_IF((sizeof(T) <= 2 && std::is_integral<T>::value))>
3575static SIMD_INLINE void swizzle(Vec<T, 64> v[3], Integer<3>)
3576{
3577 Vec<T, 64> vs[3];
3578 swizzle_64_16<3>(v, vs);
3579 __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<3, T>());
3580 __m512i s0 = align_shuffle_512<0>(vs[0], vs[1], mask);
3581 __m512i s1 = align_shuffle_512<12>(vs[0], vs[1], mask);
3582 __m512i s2 = align_shuffle_512<8>(vs[1], vs[2], mask);
3583 __m512i s3 = align_shuffle_512<4>(vs[2], _mm512_undefined_epi32(), mask);
3584 __m512i l01 = _mm512_unpacklo_epi32(s0, s1);
3585 __m512i h01 = _mm512_unpackhi_epi32(s0, s1);
3586 __m512i l23 = _mm512_unpacklo_epi32(s2, s3);
3587 __m512i h23 = _mm512_unpackhi_epi32(s2, s3);
3588 v[0] = _mm512_unpacklo_epi64(l01, l23);
3589 v[1] = _mm512_unpackhi_epi64(l01, l23);
3590 v[2] = _mm512_unpacklo_epi64(h01, h23);
3591}
3592
3593// 32 bit types
3594// from Stan Melax: "3D Vector Normalization..."
3595// https://software.intel.com/en-us/articles/3d-vector-normalization-using-512-bit-intel-advanced-vector-extensions-intel-avx
3596template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3597static SIMD_INLINE void swizzle(Vec<T, 64> v[3], Integer<3>)
3598{
3599 const Vec<Float, 64> vFloat[3] = {
3600 reinterpret(v[0], OutputType<Float>()),
3601 reinterpret(v[1], OutputType<Float>()),
3602 reinterpret(v[2], OutputType<Float>()),
3603 };
3604 Vec<Float, 64> vs[3];
3605 swizzle_64_16<3>(vFloat, vs);
3606 // x0y0z0x1 = vs[0]
3607 // y1z1x2y2 = vs[1]
3608 // z2x3y3z3 = vs[2]
3609 __m512 x2y2x3y3 = _mm512_shuffle_ps(vs[1], vs[2], _MM_SHUFFLE(2, 1, 3, 2));
3610 __m512 y0z0y1z1 = _mm512_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 2, 1));
3611 // x0x1x2x3
3612 const Vec<Float, 64> vOut0 =
3613 _mm512_shuffle_ps(vs[0], x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
3614 // y0y1y2y3
3615 const Vec<Float, 64> vOut1 =
3616 _mm512_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
3617 // z0z1z2z3
3618 const Vec<Float, 64> vOut2 =
3619 _mm512_shuffle_ps(y0z0y1z1, vs[2], _MM_SHUFFLE(3, 0, 3, 1));
3620 v[0] = reinterpret(vOut0, OutputType<T>());
3621 v[1] = reinterpret(vOut1, OutputType<T>());
3622 v[2] = reinterpret(vOut2, OutputType<T>());
3623}
3624
3625// 64 bit types
3626template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3627 typename = void>
3628static SIMD_INLINE void swizzle(Vec<T, 64> v[3], Integer<3>)
3629{
3630 const Vec<Double, 64> vDouble[3] = {
3631 reinterpret(v[0], OutputType<Double>()),
3632 reinterpret(v[1], OutputType<Double>()),
3633 reinterpret(v[2], OutputType<Double>()),
3634 };
3635 Vec<Double, 64> vs[3];
3636 swizzle_64_16<3>(vDouble, vs);
3637 const Vec<Double, 64> vOut[3] = {
3638 _mm512_shuffle_pd(vs[0], vs[1], 0xaa), // 0b1010_1010
3639 _mm512_shuffle_pd(vs[0], vs[2], 0x55), // 0b0101_0101
3640 _mm512_shuffle_pd(vs[1], vs[2], 0xaa), // 0b1010_1010
3641 };
3642 v[0] = reinterpret(vOut[0], OutputType<T>());
3643 v[1] = reinterpret(vOut[1], OutputType<T>());
3644 v[2] = reinterpret(vOut[2], OutputType<T>());
3645}
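
// Usage sketch (illustration only, not part of the original source): the n = 3
// overloads above deinterleave channel-interleaved (AoS) data into separate
// channel vectors (SoA) in place, e.g. packed xyz or rgb float data:
static SIMD_INLINE void example_swizzle_xyz(Vec<Float, 64> v[3])
{
  // before: v[0..2] hold x0 y0 z0 x1 y1 z1 ... (interleaved)
  // after:  v[0] = all x, v[1] = all y, v[2] = all z components
  swizzle(v, Integer<3>());
}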
3646
3647// -------------------- n = 4 --------------------
3648
3649// 8 and 16 bit integer types
3650template <typename T,
3651 SIMD_ENABLE_IF((sizeof(T) <= 2 && std::is_integral<T>::value))>
3652static SIMD_INLINE void swizzle(Vec<T, 64> v[4], Integer<4>)
3653{
3654 Vec<T, 64> vs[4];
3655 swizzle_64_16<4>(v, vs);
3656 __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<4, T>());
3657 __m512i s[4];
3658 for (size_t j = 0; j < 4; j++) s[j] = x_mm512_shuffle_epi8(vs[j], mask);
3659 __m512i l01 = _mm512_unpacklo_epi32(s[0], s[1]);
3660 __m512i h01 = _mm512_unpackhi_epi32(s[0], s[1]);
3661 __m512i l23 = _mm512_unpacklo_epi32(s[2], s[3]);
3662 __m512i h23 = _mm512_unpackhi_epi32(s[2], s[3]);
3663 v[0] = _mm512_unpacklo_epi64(l01, l23);
3664 v[1] = _mm512_unpackhi_epi64(l01, l23);
3665 v[2] = _mm512_unpacklo_epi64(h01, h23);
3666 v[3] = _mm512_unpackhi_epi64(h01, h23);
3667}
3668
3669// 32 bit types
3670template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3671static SIMD_INLINE void swizzle(Vec<T, 64> v[4], Integer<4>)
3672{
3673 Vec<Int, 64> vInt[4];
3674 for (size_t i = 0; i < 4; i++) vInt[i] = reinterpret(v[i], OutputType<Int>());
3675 Vec<Int, 64> vs[4];
3676 swizzle_64_16<4>(vInt, vs);
3677 const __m512i s[4] = {
3678 _mm512_unpacklo_epi32(vs[0], vs[1]),
3679 _mm512_unpackhi_epi32(vs[0], vs[1]),
3680 _mm512_unpacklo_epi32(vs[2], vs[3]),
3681 _mm512_unpackhi_epi32(vs[2], vs[3]),
3682 };
3683 const Vec<Int, 64> vOut[4] = {
3684 _mm512_unpacklo_epi64(s[0], s[2]),
3685 _mm512_unpackhi_epi64(s[0], s[2]),
3686 _mm512_unpacklo_epi64(s[1], s[3]),
3687 _mm512_unpackhi_epi64(s[1], s[3]),
3688 };
3689 for (size_t i = 0; i < 4; i++) v[i] = reinterpret(vOut[i], OutputType<T>());
3690}
3691
3692// 64 bit types
3693template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3694 typename = void>
3695static SIMD_INLINE void swizzle(Vec<T, 64> v[4], Integer<4>)
3696{
3697 Vec<Double, 64> vDouble[4];
3698 for (size_t i = 0; i < 4; i++)
3699 vDouble[i] = reinterpret(v[i], OutputType<Double>());
3700 Vec<Double, 64> vs[4];
3701 swizzle_64_16<4>(vDouble, vs);
3702 const Vec<Double, 64> vOut[4] = {
3703 _mm512_shuffle_pd(vs[0], vs[2], 0x00), // 0b0000_0000
3704 _mm512_shuffle_pd(vs[0], vs[2], 0xFF), // 0b1111_1111
3705 _mm512_shuffle_pd(vs[1], vs[3], 0x00), // 0b0000_0000
3706 _mm512_shuffle_pd(vs[1], vs[3], 0xFF), // 0b1111_1111
3707 };
3708 for (size_t i = 0; i < 4; i++) v[i] = reinterpret(vOut[i], OutputType<T>());
3709}
3710
3711// -------------------- n = 5 --------------------
3712
3713// 8 bit integer types
3714template <typename T,
3715 SIMD_ENABLE_IF(sizeof(T) == 1 && std::is_integral<T>::value)>
3716static SIMD_INLINE void swizzle(Vec<T, 64> v[5], Integer<5>)
3717{
3718 Vec<T, 64> vs[5];
3719 swizzle_64_16<5>(v, vs);
3720 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<5, T>());
3721 const __m512i s[8] = {
3722 align_shuffle_512<0>(vs[0], vs[1], mask),
3723 align_shuffle_512<10>(vs[0], vs[1], mask),
3724 align_shuffle_512<4>(vs[1], vs[2], mask),
3725 align_shuffle_512<14>(vs[1], vs[2], mask),
3726 align_shuffle_512<8>(vs[2], vs[3], mask),
3727 align_shuffle_512<2>(vs[3], vs[4], mask),
3728 align_shuffle_512<12>(vs[3], vs[4], mask),
3729 align_shuffle_512<6>(vs[4], _mm512_undefined_epi32(), mask),
3730 };
3731 __m512i l01 = x_mm512_unpacklo_epi16(s[0], s[1]);
3732 __m512i h01 = x_mm512_unpackhi_epi16(s[0], s[1]);
3733 __m512i l23 = x_mm512_unpacklo_epi16(s[2], s[3]);
3734 __m512i h23 = x_mm512_unpackhi_epi16(s[2], s[3]);
3735 __m512i l45 = x_mm512_unpacklo_epi16(s[4], s[5]);
3736 __m512i h45 = x_mm512_unpackhi_epi16(s[4], s[5]);
3737 __m512i l67 = x_mm512_unpacklo_epi16(s[6], s[7]);
3738 __m512i h67 = x_mm512_unpackhi_epi16(s[6], s[7]);
3739 __m512i ll01l23 = _mm512_unpacklo_epi32(l01, l23);
3740 __m512i hl01l23 = _mm512_unpackhi_epi32(l01, l23);
3741 __m512i ll45l67 = _mm512_unpacklo_epi32(l45, l67);
3742 __m512i hl45l67 = _mm512_unpackhi_epi32(l45, l67);
3743 __m512i lh01h23 = _mm512_unpacklo_epi32(h01, h23);
3744 __m512i lh45h67 = _mm512_unpacklo_epi32(h45, h67);
3745 v[0] = _mm512_unpacklo_epi64(ll01l23, ll45l67);
3746 v[1] = _mm512_unpackhi_epi64(ll01l23, ll45l67);
3747 v[2] = _mm512_unpacklo_epi64(hl01l23, hl45l67);
3748 v[3] = _mm512_unpackhi_epi64(hl01l23, hl45l67);
3749 v[4] = _mm512_unpacklo_epi64(lh01h23, lh45h67);
3750}
3751
3752// 16 bit integer types
3753template <typename T,
3754 SIMD_ENABLE_IF(sizeof(T) == 2 && std::is_integral<T>::value),
3755 typename = void>
3756static SIMD_INLINE void swizzle(Vec<T, 64> v[5], Integer<5>)
3757{
3758 Vec<T, 64> vs[5];
3759 swizzle_64_16<5>(v, vs);
3760 const __m512i mask = _mm512_broadcast_i32x4(get_swizzle_mask<5, T>());
3761 const __m512i s[8] = {
3762 align_shuffle_512<0>(vs[0], vs[1], mask),
3763 align_shuffle_512<6>(vs[0], vs[1], mask),
3764 align_shuffle_512<4>(vs[1], vs[2], mask),
3765 align_shuffle_512<10>(vs[1], vs[2], mask),
3766 align_shuffle_512<8>(vs[2], vs[3], mask),
3767 align_shuffle_512<14>(vs[2], vs[3], mask),
3768 align_shuffle_512<12>(vs[3], vs[4], mask),
3769 align_shuffle_512<2>(vs[4], _mm512_undefined_epi32(), mask),
3770 };
3771 __m512i l02 = _mm512_unpacklo_epi32(s[0], s[2]);
3772 __m512i h02 = _mm512_unpackhi_epi32(s[0], s[2]);
3773 __m512i l13 = _mm512_unpacklo_epi32(s[1], s[3]);
3774 __m512i l46 = _mm512_unpacklo_epi32(s[4], s[6]);
3775 __m512i h46 = _mm512_unpackhi_epi32(s[4], s[6]);
3776 __m512i l57 = _mm512_unpacklo_epi32(s[5], s[7]);
3777 v[0] = _mm512_unpacklo_epi64(l02, l46);
3778 v[1] = _mm512_unpackhi_epi64(l02, l46);
3779 v[2] = _mm512_unpacklo_epi64(h02, h46);
3780 v[3] = _mm512_unpacklo_epi64(l13, l57);
3781 v[4] = _mm512_unpackhi_epi64(l13, l57);
3782}
3783
3784// 32 bit types
3785template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void,
3786 typename = void>
3787static SIMD_INLINE void swizzle(Vec<T, 64> v[5], Integer<5>)
3788{
3789 Vec<Int, 64> vInt[5];
3790 for (size_t i = 0; i < 5; i++) {
3791 vInt[i] = reinterpret(v[i], OutputType<Int>());
3792 }
3793 Vec<Int, 64> vs[5];
3794 swizzle_64_16<5>(vInt, vs);
3795 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3796 // v[0]: 0 1 2 3
3797 // v[1]: 4 x x x
3798 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3799 // x x x x
3800 // 5 6 7 8
3801 __m512i s2 = x_mm512_alignr_epi8<4>(vs[2], vs[1]);
3802 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3803 // x x x x
3804 // 9 x x x
3805 __m512i s3 = x_mm512_alignr_epi8<4>(vs[3], vs[2]);
3806 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3807 // x x x x
3808 // 10 11 12 13
3809 __m512i s4 = x_mm512_alignr_epi8<8>(vs[3], vs[2]);
3810 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3811 // x x x x
3812 // 14 x x x
3813 __m512i s5 = x_mm512_alignr_epi8<8>(vs[4], vs[3]);
3814 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3815 // X X X X
3816 // 15 16 17 18
3817 __m512i s6 = x_mm512_alignr_epi8<12>(vs[4], vs[3]);
3818 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3819 // X X X X
3820 // 19 x x x
3821 __m512i s7 = x_mm512_alignr_epi8<12>(vs[0], vs[4]);
3822 // 0 1 2 3 / 5 6 7 8 -> 0 5 1 6 / 2 7 3 8
3823 __m512i l02 = _mm512_unpacklo_epi32(vs[0], s2);
3824 __m512i h02 = _mm512_unpackhi_epi32(vs[0], s2);
3825 // 4 x x x / 9 x x x -> 4 9 x x
3826 __m512i l13 = _mm512_unpacklo_epi32(vs[1], s3);
3827 // 10 11 12 13 / 15 16 17 18 -> 10 15 11 16 / 12 17 13 18
3828 __m512i l46 = _mm512_unpacklo_epi32(s4, s6);
3829 __m512i h46 = _mm512_unpackhi_epi32(s4, s6);
3830 // 14 x x x / 19 x x x -> 14 19 x x
3831 __m512i l57 = _mm512_unpacklo_epi32(s5, s7);
3832 const Vec<Int, 64> vOut[5] = {
3833 // 0 5 1 6 / 10 15 11 16 -> 0 5 10 15 / 1 6 11 16
3834 _mm512_unpacklo_epi64(l02, l46),
3835 _mm512_unpackhi_epi64(l02, l46),
3836 // 2 7 3 8 / 12 17 13 18 -> 2 7 12 17 / 3 8 13 18
3837 _mm512_unpacklo_epi64(h02, h46),
3838 _mm512_unpackhi_epi64(h02, h46),
3839 // 4 9 x x / 14 19 x x -> 4 9 14 19
3840 _mm512_unpacklo_epi64(l13, l57),
3841 };
3842 for (size_t i = 0; i < 5; i++) {
3843 v[i] = reinterpret(vOut[i], OutputType<T>());
3844 }
3845}
3846
3847// 64 bit types
3848template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3849 typename = void, typename = void>
3850static SIMD_INLINE void swizzle(Vec<T, 64> v[5], Integer<5>)
3851{
3852 Vec<Double, 64> vDouble[5];
3853 for (size_t i = 0; i < 5; i++) {
3854 vDouble[i] = reinterpret(v[i], OutputType<Double>());
3855 }
3856 Vec<Double, 64> vs[5];
3857 swizzle_64_16<5>(vDouble, vs);
3858 const Vec<Double, 64> vOut[5] = {
3859 _mm512_shuffle_pd(vs[0], vs[2], 0xaa), // 0b1010_1010
3860 _mm512_shuffle_pd(vs[0], vs[3], 0x55), // 0b0101_0101
3861 _mm512_shuffle_pd(vs[1], vs[3], 0xaa), // 0b1010_1010
3862 _mm512_shuffle_pd(vs[1], vs[4], 0x55), // 0b0101_0101
3863 _mm512_shuffle_pd(vs[2], vs[4], 0xaa), // 0b1010_1010
3864 };
3865 for (size_t i = 0; i < 5; i++) {
3866 v[i] = reinterpret(vOut[i], OutputType<T>());
3867 }
3868}
3869
3870// ---------------------------------------------------------------------------
3871// comparison functions
3872// ---------------------------------------------------------------------------
3873
3874// 28. Mar 23 (Jonas Keller): checked the constants for _mm512_cmp_ps_mask in
3875// the Float comparison functions, they match the implementation of the SSE
3876// versions (see cmpps in Intel manual) and added corresponding comments
3877
3878// ---------------------------------------------------------------------------
3879// compare < v
3880// ---------------------------------------------------------------------------
3881
3882// https://stackoverflow.com/questions/48099006/
3883// different-semantic-of-comparison-intrinsic-instructions-in-avx512
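
// Background sketch (illustration, not part of the original source): the
// AVX512 compare intrinsics return a bit mask (one bit per element) rather
// than a vector; the movm helpers expand such a mask back into the
// all-ones / all-zeros element form used throughout this library, e.g.:
//
//   const __mmask16 m = _mm512_cmplt_epi32_mask(a, b); // one bit per element
//   const Vec<Int, 64> r = x_mm512_movm_epi32(m);      // 0xffffffff or 0x0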
3884
3885#ifdef __AVX512BW__
3886
3887static SIMD_INLINE Vec<Byte, 64> cmplt(const Vec<Byte, 64> &a,
3888 const Vec<Byte, 64> &b)
3889{
3890 return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(a, b));
3891}
3892
3893static SIMD_INLINE Vec<SignedByte, 64> cmplt(const Vec<SignedByte, 64> &a,
3894 const Vec<SignedByte, 64> &b)
3895{
3896 return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a, b));
3897}
3898
3899static SIMD_INLINE Vec<Word, 64> cmplt(const Vec<Word, 64> &a,
3900 const Vec<Word, 64> &b)
3901{
3902 return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(a, b));
3903}
3904
3905static SIMD_INLINE Vec<Short, 64> cmplt(const Vec<Short, 64> &a,
3906 const Vec<Short, 64> &b)
3907{
3908 return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(a, b));
3909}
3910
3911#else
3912
3913// non-avx512bw workaround
3914template <typename T>
3915static SIMD_INLINE Vec<T, 64> cmplt(const Vec<T, 64> &a, const Vec<T, 64> &b)
3916{
3917 return Vec<T, 64>(cmplt(a.lo(), b.lo()), cmplt(a.hi(), b.hi()));
3918}
3919
3920#endif
3921
3922static SIMD_INLINE Vec<Int, 64> cmplt(const Vec<Int, 64> &a,
3923 const Vec<Int, 64> &b)
3924{
3925 return x_mm512_movm_epi32(_mm512_cmplt_epi32_mask(a, b));
3926}
3927
3928static SIMD_INLINE Vec<Long, 64> cmplt(const Vec<Long, 64> &a,
3929 const Vec<Long, 64> &b)
3930{
3931 return x_mm512_movm_epi64(_mm512_cmplt_epi64_mask(a, b));
3932}
3933
3934static SIMD_INLINE Vec<Float, 64> cmplt(const Vec<Float, 64> &a,
3935 const Vec<Float, 64> &b)
3936{
3937 // same constant as in implementation of _mm_cmplt_ps (see cmpps instruction
3938 // in Intel manual)
3939 return _mm512_castsi512_ps(
3940 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_LT_OS)));
3941}
3942
3943static SIMD_INLINE Vec<Double, 64> cmplt(const Vec<Double, 64> &a,
3944 const Vec<Double, 64> &b)
3945{
3946 // same constant as in implementation of _mm_cmplt_pd (see cmppd instruction
3947 // in Intel manual)
3948 return _mm512_castsi512_pd(
3949 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_LT_OS)));
3950}
3951
3952// ---------------------------------------------------------------------------
3953// compare <= v
3954// ---------------------------------------------------------------------------
3955
3956// https://stackoverflow.com/questions/48099006/
3957// different-semantic-of-comparison-intrinsic-instructions-in-avx512
3958
3959#ifdef __AVX512BW__
3960
3961static SIMD_INLINE Vec<Byte, 64> cmple(const Vec<Byte, 64> &a,
3962 const Vec<Byte, 64> &b)
3963{
3964 return _mm512_movm_epi8(_mm512_cmple_epu8_mask(a, b));
3965}
3966
3967static SIMD_INLINE Vec<SignedByte, 64> cmple(const Vec<SignedByte, 64> &a,
3968 const Vec<SignedByte, 64> &b)
3969{
3970 return _mm512_movm_epi8(_mm512_cmple_epi8_mask(a, b));
3971}
3972
3973static SIMD_INLINE Vec<Word, 64> cmple(const Vec<Word, 64> &a,
3974 const Vec<Word, 64> &b)
3975{
3976 return _mm512_movm_epi16(_mm512_cmple_epu16_mask(a, b));
3977}
3978
3979static SIMD_INLINE Vec<Short, 64> cmple(const Vec<Short, 64> &a,
3980 const Vec<Short, 64> &b)
3981{
3982 return _mm512_movm_epi16(_mm512_cmple_epi16_mask(a, b));
3983}
3984
3985#else
3986
3987// non-avx512bw workaround
3988template <typename T>
3989static SIMD_INLINE Vec<T, 64> cmple(const Vec<T, 64> &a, const Vec<T, 64> &b)
3990{
3991 return Vec<T, 64>(cmple(a.lo(), b.lo()), cmple(a.hi(), b.hi()));
3992}
3993
3994#endif
3995
3996static SIMD_INLINE Vec<Int, 64> cmple(const Vec<Int, 64> &a,
3997 const Vec<Int, 64> &b)
3998{
3999 return x_mm512_movm_epi32(_mm512_cmple_epi32_mask(a, b));
4000}
4001
4002static SIMD_INLINE Vec<Long, 64> cmple(const Vec<Long, 64> &a,
4003 const Vec<Long, 64> &b)
4004{
4005 return x_mm512_movm_epi64(_mm512_cmple_epi64_mask(a, b));
4006}
4007
4008static SIMD_INLINE Vec<Float, 64> cmple(const Vec<Float, 64> &a,
4009 const Vec<Float, 64> &b)
4010{
4011 // same constant as in implementation of _mm_cmple_ps (see cmpps instruction
4012 // in Intel manual)
4013 return _mm512_castsi512_ps(
4014 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_LE_OS)));
4015}
4016
4017static SIMD_INLINE Vec<Double, 64> cmple(const Vec<Double, 64> &a,
4018 const Vec<Double, 64> &b)
4019{
4020 // same constant as in implementation of _mm_cmple_pd (see cmppd instruction
4021 // in Intel manual)
4022 return _mm512_castsi512_pd(
4023 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_LE_OS)));
4024}
4025
4026// ---------------------------------------------------------------------------
4027// compare == v
4028// ---------------------------------------------------------------------------
4029
4030// https://stackoverflow.com/questions/48099006/
4031// different-semantic-of-comparison-intrinsic-instructions-in-avx512
4032
4033#ifdef __AVX512BW__
4034
4035static SIMD_INLINE Vec<Byte, 64> cmpeq(const Vec<Byte, 64> &a,
4036 const Vec<Byte, 64> &b)
4037{
4038 return _mm512_movm_epi8(_mm512_cmpeq_epu8_mask(a, b));
4039}
4040
4041static SIMD_INLINE Vec<SignedByte, 64> cmpeq(const Vec<SignedByte, 64> &a,
4042 const Vec<SignedByte, 64> &b)
4043{
4044 return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(a, b));
4045}
4046
4047static SIMD_INLINE Vec<Word, 64> cmpeq(const Vec<Word, 64> &a,
4048 const Vec<Word, 64> &b)
4049{
4050 return _mm512_movm_epi16(_mm512_cmpeq_epu16_mask(a, b));
4051}
4052
4053static SIMD_INLINE Vec<Short, 64> cmpeq(const Vec<Short, 64> &a,
4054 const Vec<Short, 64> &b)
4055{
4056 return _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(a, b));
4057}
4058
4059#else
4060
4061// non-avx512bw workaround
4062template <typename T>
4063static SIMD_INLINE Vec<T, 64> cmpeq(const Vec<T, 64> &a, const Vec<T, 64> &b)
4064{
4065 return Vec<T, 64>(cmpeq(a.lo(), b.lo()), cmpeq(a.hi(), b.hi()));
4066}
4067
4068#endif
4069
4070static SIMD_INLINE Vec<Int, 64> cmpeq(const Vec<Int, 64> &a,
4071 const Vec<Int, 64> &b)
4072{
4073 return x_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(a, b));
4074}
4075
4076static SIMD_INLINE Vec<Long, 64> cmpeq(const Vec<Long, 64> &a,
4077 const Vec<Long, 64> &b)
4078{
4079 return x_mm512_movm_epi64(_mm512_cmpeq_epi64_mask(a, b));
4080}
4081
4082static SIMD_INLINE Vec<Float, 64> cmpeq(const Vec<Float, 64> &a,
4083 const Vec<Float, 64> &b)
4084{
4085 // same constant as in implementation of _mm_cmpeq_ps (see cmpps instruction
4086 // in Intel manual)
4087 return _mm512_castsi512_ps(
4088 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)));
4089}
4090
4091static SIMD_INLINE Vec<Double, 64> cmpeq(const Vec<Double, 64> &a,
4092 const Vec<Double, 64> &b)
4093{
4094 // same constant as in implementation of _mm_cmpeq_pd (see cmppd instruction
4095 // in Intel manual)
4096 return _mm512_castsi512_pd(
4097 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)));
4098}
4099
4100// ---------------------------------------------------------------------------
4101// compare > v
4102// ---------------------------------------------------------------------------
4103
4104// https://stackoverflow.com/questions/48099006/
4105// different-semantic-of-comparison-intrinsic-instructions-in-avx512
4106
4107#ifdef __AVX512BW__
4108
4109static SIMD_INLINE Vec<Byte, 64> cmpgt(const Vec<Byte, 64> &a,
4110 const Vec<Byte, 64> &b)
4111{
4112 return _mm512_movm_epi8(_mm512_cmpgt_epu8_mask(a, b));
4113}
4114
4115static SIMD_INLINE Vec<SignedByte, 64> cmpgt(const Vec<SignedByte, 64> &a,
4116 const Vec<SignedByte, 64> &b)
4117{
4118 return _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(a, b));
4119}
4120
4121static SIMD_INLINE Vec<Word, 64> cmpgt(const Vec<Word, 64> &a,
4122 const Vec<Word, 64> &b)
4123{
4124 return _mm512_movm_epi16(_mm512_cmpgt_epu16_mask(a, b));
4125}
4126
4127static SIMD_INLINE Vec<Short, 64> cmpgt(const Vec<Short, 64> &a,
4128 const Vec<Short, 64> &b)
4129{
4130 return _mm512_movm_epi16(_mm512_cmpgt_epi16_mask(a, b));
4131}
4132
4133#else
4134
4135// non-avx512bw workaround
4136template <typename T>
4137static SIMD_INLINE Vec<T, 64> cmpgt(const Vec<T, 64> &a, const Vec<T, 64> &b)
4138{
4139 return Vec<T, 64>(cmpgt(a.lo(), b.lo()), cmpgt(a.hi(), b.hi()));
4140}
4141
4142#endif
4143
4144static SIMD_INLINE Vec<Int, 64> cmpgt(const Vec<Int, 64> &a,
4145 const Vec<Int, 64> &b)
4146{
4147 return x_mm512_movm_epi32(_mm512_cmpgt_epi32_mask(a, b));
4148}
4149
4150static SIMD_INLINE Vec<Long, 64> cmpgt(const Vec<Long, 64> &a,
4151 const Vec<Long, 64> &b)
4152{
4153 return x_mm512_movm_epi64(_mm512_cmpgt_epi64_mask(a, b));
4154}
4155
4156static SIMD_INLINE Vec<Float, 64> cmpgt(const Vec<Float, 64> &a,
4157 const Vec<Float, 64> &b)
4158{
4159 // same constant as in implementation of _mm_cmplt_ps (see cmpps instruction
4160 // in Intel manual), except this is > instead of <
4161 return _mm512_castsi512_ps(
4162 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS)));
4163}
4164
4165static SIMD_INLINE Vec<Double, 64> cmpgt(const Vec<Double, 64> &a,
4166 const Vec<Double, 64> &b)
4167{
4168 // same constant as in implementation of _mm_cmplt_pd (see cmppd instruction
4169 // in Intel manual), except this is > instead of <
4170 return _mm512_castsi512_pd(
4171 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_GT_OS)));
4172}
4173
4174// ---------------------------------------------------------------------------
4175// compare >= v
4176// ---------------------------------------------------------------------------
4177
4178// https://stackoverflow.com/questions/48099006/
4179// different-semantic-of-comparison-intrinsic-instructions-in-avx512
4180
4181#ifdef __AVX512BW__
4182
4183static SIMD_INLINE Vec<Byte, 64> cmpge(const Vec<Byte, 64> &a,
4184 const Vec<Byte, 64> &b)
4185{
4186 return _mm512_movm_epi8(_mm512_cmpge_epu8_mask(a, b));
4187}
4188
4189static SIMD_INLINE Vec<SignedByte, 64> cmpge(const Vec<SignedByte, 64> &a,
4190 const Vec<SignedByte, 64> &b)
4191{
4192 return _mm512_movm_epi8(_mm512_cmpge_epi8_mask(a, b));
4193}
4194
4195static SIMD_INLINE Vec<Word, 64> cmpge(const Vec<Word, 64> &a,
4196 const Vec<Word, 64> &b)
4197{
4198 return _mm512_movm_epi16(_mm512_cmpge_epu16_mask(a, b));
4199}
4200
4201static SIMD_INLINE Vec<Short, 64> cmpge(const Vec<Short, 64> &a,
4202 const Vec<Short, 64> &b)
4203{
4204 return _mm512_movm_epi16(_mm512_cmpge_epi16_mask(a, b));
4205}
4206
4207#else
4208
4209// non-avx512bw workaround
4210template <typename T>
4211static SIMD_INLINE Vec<T, 64> cmpge(const Vec<T, 64> &a, const Vec<T, 64> &b)
4212{
4213 return Vec<T, 64>(cmpge(a.lo(), b.lo()), cmpge(a.hi(), b.hi()));
4214}
4215
4216#endif
4217
4218static SIMD_INLINE Vec<Int, 64> cmpge(const Vec<Int, 64> &a,
4219 const Vec<Int, 64> &b)
4220{
4221 return x_mm512_movm_epi32(_mm512_cmpge_epi32_mask(a, b));
4222}
4223
4224static SIMD_INLINE Vec<Long, 64> cmpge(const Vec<Long, 64> &a,
4225 const Vec<Long, 64> &b)
4226{
4227 return x_mm512_movm_epi64(_mm512_cmpge_epi64_mask(a, b));
4228}
4229
4230static SIMD_INLINE Vec<Float, 64> cmpge(const Vec<Float, 64> &a,
4231 const Vec<Float, 64> &b)
4232{
4233 // same constant as in implementation of _mm_cmple_ps (see cmpps instruction
4234 // in Intel manual), except this is >= instead of <=
4235 return _mm512_castsi512_ps(
4236 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GE_OS)));
4237}
4238
4239static SIMD_INLINE Vec<Double, 64> cmpge(const Vec<Double, 64> &a,
4240 const Vec<Double, 64> &b)
4241{
4242 // same constant as in implementation of _mm_cmple_pd (see cmppd instruction
4243 // in Intel manual), except this is >= instead of <=
4244 return _mm512_castsi512_pd(
4245 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_GE_OS)));
4246}
4247
4248// ---------------------------------------------------------------------------
4249// compare != v
4250// ---------------------------------------------------------------------------
4251
4252// https://stackoverflow.com/questions/48099006/
4253// different-semantic-of-comparison-intrinsic-instructions-in-avx512
4254
4255#ifdef __AVX512BW__
4256
4257static SIMD_INLINE Vec<Byte, 64> cmpneq(const Vec<Byte, 64> &a,
4258 const Vec<Byte, 64> &b)
4259{
4260 return _mm512_movm_epi8(_mm512_cmpneq_epu8_mask(a, b));
4261}
4262
4263static SIMD_INLINE Vec<SignedByte, 64> cmpneq(const Vec<SignedByte, 64> &a,
4264 const Vec<SignedByte, 64> &b)
4265{
4266 return _mm512_movm_epi8(_mm512_cmpneq_epi8_mask(a, b));
4267}
4268
4269static SIMD_INLINE Vec<Word, 64> cmpneq(const Vec<Word, 64> &a,
4270 const Vec<Word, 64> &b)
4271{
4272 return _mm512_movm_epi16(_mm512_cmpneq_epu16_mask(a, b));
4273}
4274
4275static SIMD_INLINE Vec<Short, 64> cmpneq(const Vec<Short, 64> &a,
4276 const Vec<Short, 64> &b)
4277{
4278 return _mm512_movm_epi16(_mm512_cmpneq_epi16_mask(a, b));
4279}
4280
4281#else
4282
4283// non-avx512bw workaround
4284template <typename T>
4285static SIMD_INLINE Vec<T, 64> cmpneq(const Vec<T, 64> &a, const Vec<T, 64> &b)
4286{
4287 return Vec<T, 64>(cmpneq(a.lo(), b.lo()), cmpneq(a.hi(), b.hi()));
4288}
4289
4290#endif
4291
4292static SIMD_INLINE Vec<Int, 64> cmpneq(const Vec<Int, 64> &a,
4293 const Vec<Int, 64> &b)
4294{
4295 return x_mm512_movm_epi32(_mm512_cmpneq_epi32_mask(a, b));
4296}
4297
4298static SIMD_INLINE Vec<Long, 64> cmpneq(const Vec<Long, 64> &a,
4299 const Vec<Long, 64> &b)
4300{
4301 return x_mm512_movm_epi64(_mm512_cmpneq_epi64_mask(a, b));
4302}
4303
4304static SIMD_INLINE Vec<Float, 64> cmpneq(const Vec<Float, 64> &a,
4305 const Vec<Float, 64> &b)
4306{
4307 // same constant as in implementation of _mm_cmpneq_ps (see cmpps instruction
4308 // in Intel manual)
4309 return _mm512_castsi512_ps(
4310 x_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ)));
4311}
4312
4313static SIMD_INLINE Vec<Double, 64> cmpneq(const Vec<Double, 64> &a,
4314 const Vec<Double, 64> &b)
4315{
4316 // same constant as in implementation of _mm_cmpneq_pd (see cmppd instruction
4317 // in Intel manual)
4318 return _mm512_castsi512_pd(
4319 x_mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ)));
4320}
4321
4322// ---------------------------------------------------------------------------
4323// ifelse v
4324// ---------------------------------------------------------------------------
4325
4326// 10. Apr 23 (Jonas Keller): made two versions of ifelse, one for 8 and 16 bit
4327// data types, and one for 32 bit and larger data types, so that for the latter
4328// the blendv instruction can be used even if avx512bw is not available
4329
4330// NOTE: only works if cond elements are all 1-bits or all 0-bits
4331
4332// version for 8 and 16 bit data types
4333template <typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4334static SIMD_INLINE Vec<T, 64> ifelse(const Vec<T, 64> &cond,
4335 const Vec<T, 64> &trueVal,
4336 const Vec<T, 64> &falseVal)
4337{
4338 // TODO: _mm512_movepi8_mask is slower than _mm512_or_si512, _mm512_and_si512
4339 // or _mm512_andnot_si512 according to the Intel Intrinsics Guide, maybe use
4340 // the non-avx512bw workaround always?
4341 // since _mm512_and_si512 and _mm512_andnot_si512 could potentially be
4342 // executed in parallel, that might be faster
4343#ifdef __AVX512BW__
4344 // cond -> __mask64
4345 const __mmask64 condReg =
4346 _mm512_movepi8_mask(reinterpret(cond, OutputType<Byte>()));
4347 // explicitly cast to __m512i to avoid compiler error with -O0
4348 const __m512i trueReg = (__m512i) reinterpret(trueVal, OutputType<Byte>());
4349 const __m512i falseReg = (__m512i) reinterpret(falseVal, OutputType<Byte>());
4350 const Vec<Byte, 64> res = _mm512_mask_blend_epi8(condReg, falseReg, trueReg);
4351#else
4352 const Vec<Byte, 64> res = _mm512_or_si512(
4353 _mm512_and_si512(reinterpret(cond, OutputType<Byte>()),
4354 reinterpret(trueVal, OutputType<Byte>())),
4355 _mm512_andnot_si512(reinterpret(cond, OutputType<Byte>()),
4356 reinterpret(falseVal, OutputType<Byte>())));
4357#endif
4358 return reinterpret(res, OutputType<T>());
4359}
4360
4361// version for 32 bit and larger data types
4362template <typename T, SIMD_ENABLE_IF(sizeof(T) > 2), typename = void>
4363static SIMD_INLINE Vec<T, 64> ifelse(const Vec<T, 64> &cond,
4364 const Vec<T, 64> &trueVal,
4365 const Vec<T, 64> &falseVal)
4366{
4367 // TODO: _mm512_movepi32_mask is slower than _mm512_or_si512, _mm512_and_si512
4368 // or _mm512_andnot_si512 according to the Intel Intrinsics Guide, maybe use
4369 // the non-avx512dq workaround always?
4370 // since _mm512_and_si512 and _mm512_andnot_si512 could potentially be
4371 // executed in parallel, that might be faster
4372#ifdef __AVX512DQ__
4373 // cond -> __mmask16
4374 const __mmask16 condReg =
4375 _mm512_movepi32_mask(reinterpret(cond, OutputType<Int>()));
4376 // explicitly cast to __m512i to avoid compiler error with -O0
4377 const __m512i trueReg = (__m512i) reinterpret(trueVal, OutputType<Int>());
4378 const __m512i falseReg = (__m512i) reinterpret(falseVal, OutputType<Int>());
4379 const Vec<Int, 64> res = _mm512_mask_blend_epi32(condReg, falseReg, trueReg);
4380#else
4381 const Vec<Int, 64> res = _mm512_or_si512(
4382 _mm512_and_si512(reinterpret(cond, OutputType<Int>()),
4383 reinterpret(trueVal, OutputType<Int>())),
4384 _mm512_andnot_si512(reinterpret(cond, OutputType<Int>()),
4385 reinterpret(falseVal, OutputType<Int>())));
4386#endif
4387 return reinterpret(res, OutputType<T>());
4388}
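
// Usage sketch (illustration only, not part of the original source): the
// condition is usually produced by one of the compare functions above, whose
// result elements are guaranteed to be all-ones or all-zeros as required:
static SIMD_INLINE Vec<Float, 64> example_clamp_min(const Vec<Float, 64> &x,
                                                    const Vec<Float, 64> &lo)
{
  // where x < lo take lo, elsewhere keep x
  return ifelse(cmplt(x, lo), lo, x);
}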
4389
4390// ---------------------------------------------------------------------------
4391// bit_and v
4392// ---------------------------------------------------------------------------
4393
4394template <typename T>
4395static SIMD_INLINE Vec<T, 64> bit_and(const Vec<T, 64> &a, const Vec<T, 64> &b)
4396{
4397 // reinterpret as byte for float and double versions
4398 const Vec<Byte, 64> res = _mm512_and_si512(
4399 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4400 return reinterpret(res, OutputType<T>());
4401}
4402
4403// ---------------------------------------------------------------------------
4404// bit_or v
4405// ---------------------------------------------------------------------------
4406
4407template <typename T>
4408static SIMD_INLINE Vec<T, 64> bit_or(const Vec<T, 64> &a, const Vec<T, 64> &b)
4409{
4410 // reinterpret as byte for float and double versions
4411 const Vec<Byte, 64> res = _mm512_or_si512(reinterpret(a, OutputType<Byte>()),
4412 reinterpret(b, OutputType<Byte>()));
4413 return reinterpret(res, OutputType<T>());
4414}
4415
4416// ---------------------------------------------------------------------------
4417// bit_andnot v
4418// ---------------------------------------------------------------------------
4419
4420template <typename T>
4421static SIMD_INLINE Vec<T, 64> bit_andnot(const Vec<T, 64> &a,
4422 const Vec<T, 64> &b)
4423{
4424 // reinterpret as byte for float and double versions
4425 const Vec<Byte, 64> res = _mm512_andnot_si512(
4426 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4427 return reinterpret(res, OutputType<T>());
4428}
4429
4430// ---------------------------------------------------------------------------
4431// bit_xor v
4432// ---------------------------------------------------------------------------
4433
4434template <typename T>
4435static SIMD_INLINE Vec<T, 64> bit_xor(const Vec<T, 64> &a, const Vec<T, 64> &b)
4436{
4437 // reinterpret as byte for float and double versions
4438 const Vec<Byte, 64> res = _mm512_xor_si512(
4439 reinterpret(a, OutputType<Byte>()), reinterpret(b, OutputType<Byte>()));
4440 return reinterpret(res, OutputType<T>());
4441}
4442
4443// ---------------------------------------------------------------------------
4444// bit_not v
4445// ---------------------------------------------------------------------------
4446
4447// all integer versions
4448template <typename T>
4449static SIMD_INLINE Vec<T, 64> bit_not(const Vec<T, 64> &a)
4450{
4451 // reinterpret as byte for float and double versions
4452 // from Agner Fog's VCL vectori256.h operator ~
4453 const Vec<Byte, 64> res =
4454 _mm512_xor_si512(reinterpret(a, OutputType<Byte>()), _mm512_set1_epi32(-1));
4455 return reinterpret(res, OutputType<T>());
4456}
4457
4458// ---------------------------------------------------------------------------
4459// avg: average with rounding up v
4460// ---------------------------------------------------------------------------
4461
4462#ifdef __AVX512BW__
4463
4464static SIMD_INLINE Vec<Byte, 64> avg(const Vec<Byte, 64> &a,
4465 const Vec<Byte, 64> &b)
4466{
4467 return _mm512_avg_epu8(a, b);
4468}
4469
4470// Paul R at
4471// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4472static SIMD_INLINE Vec<SignedByte, 64> avg(const Vec<SignedByte, 64> &a,
4473 const Vec<SignedByte, 64> &b)
4474{
4475 // from Agner Fog's VCL vectori128.h
4476 const __m512i signbit = _mm512_set1_epi8(int8_t(0x80));
4477 const __m512i a1 = _mm512_xor_si512(a, signbit); // add 0x80
4478 const __m512i b1 = _mm512_xor_si512(b, signbit); // add 0x80
4479 const __m512i m1 = _mm512_avg_epu8(a1, b1); // unsigned avg
4480 return _mm512_xor_si512(m1, signbit); // sub 0x80
4481}
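
// Worked example of the bias trick above (illustration, not from the original
// source): a = -1 (0xff), b = 2 (0x02); xor with 0x80 gives 127 and 130;
// unsigned avg gives (127 + 130 + 1) >> 1 = 129; xor with 0x80 again gives 1,
// which is avg(-1, 2) = 0.5 rounded up.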
4482
4483static SIMD_INLINE Vec<Word, 64> avg(const Vec<Word, 64> &a,
4484 const Vec<Word, 64> &b)
4485{
4486 return _mm512_avg_epu16(a, b);
4487}
4488
4489// Paul R at
4490// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4491static SIMD_INLINE Vec<Short, 64> avg(const Vec<Short, 64> &a,
4492 const Vec<Short, 64> &b)
4493{
4494 // from Agner Fog's VCL vectori128.h
4495 const __m512i signbit = _mm512_set1_epi16(int16_t(0x8000));
4496 const __m512i a1 = _mm512_xor_si512(a, signbit); // add 0x8000
4497 const __m512i b1 = _mm512_xor_si512(b, signbit); // add 0x8000
4498 const __m512i m1 = _mm512_avg_epu16(a1, b1); // unsigned avg
4499 return _mm512_xor_si512(m1, signbit); // sub 0x8000
4500}
4501
4502#else
4503
4504// non-avx512bw workaround
4505template <typename T>
4506static SIMD_INLINE Vec<T, 64> avg(const Vec<T, 64> &a, const Vec<T, 64> &b)
4507{
4508 return Vec<T, 64>(avg(a.lo(), b.lo()), avg(a.hi(), b.hi()));
4509}
4510
4511#endif
4512
4513// Paul R at
4514// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4515static SIMD_INLINE Vec<Int, 64> avg(const Vec<Int, 64> &a,
4516 const Vec<Int, 64> &b)
4517{
4518 const auto halfA = _mm512_srai_epi32(a, 1);
4519 const auto halfB = _mm512_srai_epi32(b, 1);
4520 const auto sum = _mm512_add_epi32(halfA, halfB);
4521 const auto lsb =
4522 _mm512_and_si512(_mm512_or_si512(a, b), _mm512_set1_epi32(1));
4523 return _mm512_add_epi32(lsb, sum);
4524}
4525
4526// Paul R at
4527// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4528static SIMD_INLINE Vec<Long, 64> avg(const Vec<Long, 64> &a,
4529 const Vec<Long, 64> &b)
4530{
4531 const auto halfA = _mm512_srai_epi64(a, 1);
4532 const auto halfB = _mm512_srai_epi64(b, 1);
4533 const auto sum = _mm512_add_epi64(halfA, halfB);
4534 const auto lsb =
4535 _mm512_and_si512(_mm512_or_si512(a, b), _mm512_set1_epi64(1));
4536 return _mm512_add_epi64(lsb, sum);
4537}
4538
4539// NOTE: Float version doesn't round!
4540static SIMD_INLINE Vec<Float, 64> avg(const Vec<Float, 64> &a,
4541 const Vec<Float, 64> &b)
4542{
4543 return _mm512_mul_ps(_mm512_add_ps(a, b), _mm512_set1_ps(0.5f));
4544}
4545
4546// NOTE: Double version doesn't round!
4547static SIMD_INLINE Vec<Double, 64> avg(const Vec<Double, 64> &a,
4548 const Vec<Double, 64> &b)
4549{
4550 return _mm512_mul_pd(_mm512_add_pd(a, b), _mm512_set1_pd(0.5));
4551}
4552
4553// ---------------------------------------------------------------------------
4554// test_all_zeros v
4555// ---------------------------------------------------------------------------
4556
4557template <typename T>
4558static SIMD_INLINE bool test_all_zeros(const Vec<T, 64> &a)
4559{
4560 const auto intA = reinterpret(a, OutputType<Int>());
4561 return _mm512_test_epi32_mask(intA, intA) == 0;
4562}
4563
4564// ---------------------------------------------------------------------------
4565// test_all_ones v
4566// ---------------------------------------------------------------------------
4567
4568// the description of the testn intrinsics was not clear, so the other way was chosen
4569// note: contrary to IEEE 754, this function considers -0.0f to be negative
4570template <typename T>
4571static SIMD_INLINE bool test_all_ones(const Vec<T, 64> &a)
4572{
4573 return test_all_zeros(bit_not(a));
4574}
4575
4576// ---------------------------------------------------------------------------
4577// reverse
4578// ---------------------------------------------------------------------------
4579
4580template <typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4581static SIMD_INLINE Vec<T, 64> reverse(const Vec<T, 64> &a)
4582{
4583 __m512i mask;
4584 SIMD_IF_CONSTEXPR (sizeof(T) == 1) {
4585 mask = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4586 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
4587 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
4588 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
4589 55, 56, 57, 58, 59, 60, 61, 62, 63);
4590 } else {
4591 mask = _mm512_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
4592 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29,
4593 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40,
4594 43, 42, 45, 44, 47, 46, 49, 48, 51, 50, 53, 52, 55,
4595 54, 57, 56, 59, 58, 61, 60, 63, 62);
4596 }
4597#ifdef __AVX512VBMI__
4598 return _mm512_permutexvar_epi8(mask, a);
4599#else
4600 const Vec<T, 64> r = x_mm512_shuffle_epi8(a, mask);
4601 return _mm512_permutexvar_epi64(_mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6), r);
4602#endif
4603}
4604
4605static SIMD_INLINE Vec<Int, 64> reverse(const Vec<Int, 64> &a)
4606{
4607 const auto mask =
4608 _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4609 return _mm512_permutexvar_epi32(mask, a);
4610}
4611
4612static SIMD_INLINE Vec<Long, 64> reverse(const Vec<Long, 64> &a)
4613{
4614 return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
4615}
4616
4617// float version, slightly changed int version
4618static SIMD_INLINE Vec<Float, 64> reverse(const Vec<Float, 64> &a)
4619{
4620 const auto mask =
4621 _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4622 return _mm512_permutexvar_ps(mask, a);
4623}
4624
4625// double version
4626static SIMD_INLINE Vec<Double, 64> reverse(const Vec<Double, 64> &a)
4627{
4628 return _mm512_permutexvar_pd(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
4629}
4630
4631// ---------------------------------------------------------------------------
4632// msb2int
4633// ---------------------------------------------------------------------------
4634
4635// 27. Aug 22 (Jonas Keller): added msb2int functions
4636
4637static SIMD_INLINE uint64_t msb2int(const Vec<Int, 64> &a)
4638{
4639#ifdef __AVX512DQ__
4640 return _mm512_movepi32_mask(a);
4641#else
4642 const __m512i mask = _mm512_set1_epi32(uint32_t(0x80000000));
4643 return _mm512_test_epi32_mask(a, mask);
4644#endif
4645}
4646
4647static SIMD_INLINE uint64_t msb2int(const Vec<Long, 64> &a)
4648{
4649#ifdef __AVX512DQ__
4650 return _mm512_movepi64_mask(a);
4651#else
4652 const __m512i mask = _mm512_set1_epi64(uint64_t(0x8000000000000000));
4653 return _mm512_test_epi64_mask(a, mask);
4654#endif
4655}
4656
4657static SIMD_INLINE uint64_t msb2int(const Vec<Float, 64> &a)
4658{
4659#ifdef __AVX512DQ__
4660 return _mm512_movepi32_mask(_mm512_castps_si512(a));
4661#else
4662 const __m512i mask = _mm512_set1_epi32(0x80000000);
4663 return _mm512_test_epi32_mask(_mm512_castps_si512(a), mask);
4664#endif
4665}
4666
4667static SIMD_INLINE uint64_t msb2int(const Vec<Double, 64> &a)
4668{
4669#ifdef __AVX512DQ__
4670 return uint64_t(_mm512_movepi64_mask(_mm512_castpd_si512(a)));
4671#else
4672
4673 const __m512i mask = _mm512_set1_epi64(0x8000000000000000);
4674 // _cvtmask8_u32 requires AVX512DQ, so just convert using implicit conversion
4675 return _mm512_test_epi64_mask(_mm512_castpd_si512(a), mask);
4676#endif
4677}
4678
4679// from:
4680// https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/
4681static SIMD_INLINE uint64_t interleave_uint32_with_zeros(uint32_t input)
4682{
4683 uint64_t word = input;
4684 word = (word ^ (word << 16)) & 0x0000ffff0000ffff;
4685 word = (word ^ (word << 8)) & 0x00ff00ff00ff00ff;
4686 word = (word ^ (word << 4)) & 0x0f0f0f0f0f0f0f0f;
4687 word = (word ^ (word << 2)) & 0x3333333333333333;
4688 word = (word ^ (word << 1)) & 0x5555555555555555;
4689 return word;
4690}
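
// Example (illustration, not from the original source):
// interleave_uint32_with_zeros(0b1011) == 0b1000101, i.e. bit k moves to
// bit 2k; applying the function twice, as done below for the 16-bit
// sub-masks, moves bit k to bit 4k.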
4691
4692static SIMD_INLINE uint64_t msb2int(const Vec<Byte, 64> &a)
4693{
4694#ifdef __AVX512BW__
4695 return _mm512_movepi8_mask(a);
4696#else
4697 const uint64_t part3 = msb2int(reinterpret(a, OutputType<Int>()));
4698 const uint64_t part2 = msb2int(reinterpret(slle<1>(a), OutputType<Int>()));
4699 const uint64_t part1 = msb2int(reinterpret(slle<2>(a), OutputType<Int>()));
4700 const uint64_t part0 = msb2int(reinterpret(slle<3>(a), OutputType<Int>()));
4701 // TODO: is there a more efficient way to interleave with 3 zeros instead of
4702 // interleaving with 1 zero twice?
4703 const uint64_t part3_with_zeros =
4704 interleave_uint32_with_zeros(interleave_uint32_with_zeros(part3));
4705 const uint64_t part2_with_zeros =
4706 interleave_uint32_with_zeros(interleave_uint32_with_zeros(part2));
4707 const uint64_t part1_with_zeros =
4708 interleave_uint32_with_zeros(interleave_uint32_with_zeros(part1));
4709 const uint64_t part0_with_zeros =
4710 interleave_uint32_with_zeros(interleave_uint32_with_zeros(part0));
4711 return part0_with_zeros | (part1_with_zeros << 1) | (part2_with_zeros << 2) |
4712 (part3_with_zeros << 3);
4713#endif
4714}
4715
4716static SIMD_INLINE uint64_t msb2int(const Vec<SignedByte, 64> &a)
4717{
4718 return msb2int(reinterpret(a, OutputType<Byte>()));
4719}
4720
4721static SIMD_INLINE uint64_t msb2int(const Vec<Short, 64> &a)
4722{
4723#ifdef __AVX512BW__
4724 return _mm512_movepi16_mask(a);
4725#else
4726 const uint64_t odd = msb2int(reinterpret(a, OutputType<Int>()));
4727 const uint64_t even = msb2int(reinterpret(slle<1>(a), OutputType<Int>()));
4728 return interleave_uint32_with_zeros(even) |
4729 (interleave_uint32_with_zeros(odd) << 1);
4730#endif
4731}
4732
4733static SIMD_INLINE uint64_t msb2int(const Vec<Word, 64> &a)
4734{
4735 return msb2int(reinterpret(a, OutputType<Short>()));
4736}
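
// Usage sketch (illustration only, not part of the original source): msb2int
// packs one bit per element, so e.g. the number of elements with a set sign
// bit can be obtained by counting the bits of the returned mask (the gcc/clang
// popcount builtin is used here purely for illustration):
static SIMD_INLINE int example_count_sign_bits(const Vec<Float, 64> &a)
{
  return __builtin_popcountll(msb2int(a));
}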
4737
4738// ---------------------------------------------------------------------------
4739// int2msb
4740// ---------------------------------------------------------------------------
4741
4742// 06. Oct 22 (Jonas Keller): added int2msb functions
4743
4744static SIMD_INLINE Vec<Byte, 64> int2msb(const uint64_t a, OutputType<Byte>,
4745 Integer<64>)
4746{
4747#ifdef __AVX512BW__
4748 return _mm512_maskz_set1_epi8(__mmask64(a), (int8_t) 0x80);
4749#else
4750 const __m256i shuffleIndices = _mm256_set_epi64x(
4751 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
4752 const __m256i aVecLo =
4753 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndices);
4754 const __m256i aVecHi =
4755 _mm256_shuffle_epi8(_mm256_set1_epi32(a >> 32), shuffleIndices);
4756 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
4757 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4758 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4759 const __m256i resultLo = _mm256_cmpeq_epi8(selectedLo, sel);
4760 const __m256i resultHi = _mm256_cmpeq_epi8(selectedHi, sel);
4761 const __m512i result =
4762 _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4763 return _mm512_and_si512(result, _mm512_set1_epi32(0x80808080));
4764#endif
4765}
4766
4767static SIMD_INLINE Vec<SignedByte, 64> int2msb(const uint64_t a,
4768 OutputType<SignedByte>,
4769 Integer<64>)
4770{
4771 return reinterpret(int2msb(a, OutputType<Byte>(), Integer<64>()),
4772 OutputType<SignedByte>());
4773}
4774
4775static SIMD_INLINE Vec<Short, 64> int2msb(const uint64_t a, OutputType<Short>,
4776 Integer<64>)
4777{
4778#ifdef __AVX512BW__
4779 return _mm512_maskz_set1_epi16(__mmask32(a), (int16_t) 0x8000);
4780#else
4781 const __m256i sel = _mm256_set_epi16(
4782 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
4783 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
4784 const __m256i aVecLo = _mm256_set1_epi16(a);
4785 const __m256i aVecHi = _mm256_set1_epi16(a >> 16);
4786 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4787 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4788 const __m256i resultLo = _mm256_cmpeq_epi16(selectedLo, sel);
4789 const __m256i resultHi = _mm256_cmpeq_epi16(selectedHi, sel);
4790 const __m512i result =
4791 _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4792 return _mm512_and_si512(result, _mm512_set1_epi32(0x80008000));
4793#endif
4794}
4795
4796static SIMD_INLINE Vec<Word, 64> int2msb(const uint64_t a, OutputType<Word>,
4797 Integer<64>)
4798{
4799 return reinterpret(int2msb(a, OutputType<Short>(), Integer<64>()),
4800 OutputType<Word>());
4801}
4802
4803static SIMD_INLINE Vec<Int, 64> int2msb(const uint64_t a, OutputType<Int>,
4804 Integer<64>)
4805{
4806 return _mm512_maskz_set1_epi32(__mmask16(a), 0x80000000);
4807}
4808
4809static SIMD_INLINE Vec<Long, 64> int2msb(const uint64_t a, OutputType<Long>,
4810 Integer<64>)
4811{
4812 return _mm512_maskz_set1_epi64(__mmask8(a), 0x8000000000000000);
4813}
4814
4815static SIMD_INLINE Vec<Float, 64> int2msb(const uint64_t a, OutputType<Float>,
4816 Integer<64>)
4817{
4818 return reinterpret(int2msb(a, OutputType<Int>(), Integer<64>()),
4819 OutputType<Float>());
4820}
4821
4822static SIMD_INLINE Vec<Double, 64> int2msb(const uint64_t a, OutputType<Double>,
4823 Integer<64>)
4824{
4825 return _mm512_castsi512_pd(
4826 _mm512_maskz_set1_epi64(__mmask8(a), 0x8000000000000000));
4827}
4828
4829// ---------------------------------------------------------------------------
4830// int2bits
4831// ---------------------------------------------------------------------------
4832
4833// 09. Oct 22 (Jonas Keller): added int2bits functions
4834
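
// Semantics sketch (illustration, not from the original source): int2bits
// expands an integer bit mask into a vector mask: bit i of the input decides
// whether element i of the result is all-ones or all-zeros; int2msb above
// instead sets only the most significant bit of each selected element, such
// that msb2int(int2bits(m)) == m and msb2int(int2msb(m)) == m for masks that
// fit into the element count.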
4835static SIMD_INLINE Vec<Byte, 64> int2bits(const uint64_t a, OutputType<Byte>,
4836 Integer<64>)
4837{
4838#ifdef __AVX512BW__
4839 return _mm512_maskz_set1_epi8(__mmask64(a), (int8_t) 0xff);
4840#else
4841 const __m256i shuffleIndices = _mm256_set_epi64x(
4842 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
4843 const __m256i aVecLo =
4844 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndices);
4845 const __m256i aVecHi =
4846 _mm256_shuffle_epi8(_mm256_set1_epi32(a >> 32), shuffleIndices);
4847 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
4848 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4849 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4850 const __m256i resultLo = _mm256_cmpeq_epi8(selectedLo, sel);
4851 const __m256i resultHi = _mm256_cmpeq_epi8(selectedHi, sel);
4852 return _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4853#endif
4854}
4855
4856static SIMD_INLINE Vec<SignedByte, 64> int2bits(const uint64_t a,
4857 OutputType<SignedByte>,
4858 Integer<64>)
4859{
4860 return reinterpret(int2bits(a, OutputType<Byte>(), Integer<64>()),
4861 OutputType<SignedByte>());
4862}
4863
4864static SIMD_INLINE Vec<Short, 64> int2bits(const uint64_t a, OutputType<Short>,
4865 Integer<64>)
4866{
4867#ifdef __AVX512BW__
4868 return _mm512_maskz_set1_epi16(__mmask32(a), (int16_t) 0xffff);
4869#else
4870 const __m256i sel = _mm256_set_epi16(
4871 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
4872 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
4873 const __m256i aVecLo = _mm256_set1_epi16(a);
4874 const __m256i aVecHi = _mm256_set1_epi16(a >> 16);
4875 const __m256i selectedLo = _mm256_and_si256(aVecLo, sel);
4876 const __m256i selectedHi = _mm256_and_si256(aVecHi, sel);
4877 const __m256i resultLo = _mm256_cmpeq_epi16(selectedLo, sel);
4878 const __m256i resultHi = _mm256_cmpeq_epi16(selectedHi, sel);
4879 return _mm512_inserti64x4(_mm512_castsi256_si512(resultLo), resultHi, 1);
4880#endif
4881}
4882
4883static SIMD_INLINE Vec<Word, 64> int2bits(const uint64_t a, OutputType<Word>,
4884 Integer<64>)
4885{
4886 return reinterpret(int2bits(a, OutputType<Short>(), Integer<64>()),
4887 OutputType<Word>());
4888}
4889
4890static SIMD_INLINE Vec<Int, 64> int2bits(const uint64_t a, OutputType<Int>,
4891 Integer<64>)
4892{
4893 return _mm512_maskz_set1_epi32(__mmask16(a), 0xffffffff);
4894}
4895
4896static SIMD_INLINE Vec<Long, 64> int2bits(const uint64_t a, OutputType<Long>,
4897 Integer<64>)
4898{
4899 return _mm512_maskz_set1_epi64(__mmask8(a), 0xffffffffffffffff);
4900}
4901
4902static SIMD_INLINE Vec<Float, 64> int2bits(const uint64_t a, OutputType<Float>,
4903 Integer<64>)
4904{
4905 return reinterpret(int2bits(a, OutputType<Int>(), Integer<64>()),
4906 OutputType<Float>());
4907}
4908
4909static SIMD_INLINE Vec<Double, 64> int2bits(const uint64_t a,
4910 OutputType<Double>, Integer<64>)
4911{
4912 return _mm512_castsi512_pd(
4913 _mm512_maskz_set1_epi64(__mmask8(a), 0xffffffffffffffff));
4914}
4915
4916// ---------------------------------------------------------------------------
4917// iota
4918// ---------------------------------------------------------------------------
4919
4920// 30. Jan 23 (Jonas Keller): added iota
4921
4922static SIMD_INLINE Vec<Byte, 64> iota(OutputType<Byte>, Integer<64>)
4923{
4924 return _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50,
4925 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
4926 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
4927 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
4928 7, 6, 5, 4, 3, 2, 1, 0);
4929}
4930
4931static SIMD_INLINE Vec<SignedByte, 64> iota(OutputType<SignedByte>, Integer<64>)
4932{
4933 return _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50,
4934 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
4935 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
4936 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
4937 7, 6, 5, 4, 3, 2, 1, 0);
4938}
4939
4940static SIMD_INLINE Vec<Short, 64> iota(OutputType<Short>, Integer<64>)
4941{
4942 return _mm512_set_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
4943 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
4944 3, 2, 1, 0);
4945}
4946
4947static SIMD_INLINE Vec<Word, 64> iota(OutputType<Word>, Integer<64>)
4948{
4949 return _mm512_set_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
4950 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
4951 3, 2, 1, 0);
4952}
4953
4954static SIMD_INLINE Vec<Int, 64> iota(OutputType<Int>, Integer<64>)
4955{
4956 return _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4957}
4958
4959static SIMD_INLINE Vec<Long, 64> iota(OutputType<Long>, Integer<64>)
4960{
4961 return _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
4962}
4963
4964static SIMD_INLINE Vec<Float, 64> iota(OutputType<Float>, Integer<64>)
4965{
4966 return _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
4967 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
4968}
4969
4970static SIMD_INLINE Vec<Double, 64> iota(OutputType<Double>, Integer<64>)
4971{
4972 return _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
4973}
4974
4975} // namespace base
4976} // namespace internal
4977} // namespace simd
4978
4979#endif
4980
4981#endif // SIMD_VEC_BASE_IMPL_INTEL_64_H_