T-SIMD v31.1.0
A C++ template SIMD library
base_impl_intel32.H
1// ===========================================================================
2//
3// encapsulation for AVX/AVX2 Intel vector extensions
4// inspired by Agner Fog's C++ Vector Class Library
5// http://www.agner.org/optimize/#vectorclass
6// (VCL License: GNU General Public License Version 3,
7// http://www.gnu.org/licenses/gpl-3.0.en.html)
8//
9// This source code file is part of the following software:
10//
11// - the low-level C++ template SIMD library
12// - the SIMD implementation of the MinWarping and the 2D-Warping methods
13// for local visual homing.
14//
15// The software is provided based on the accompanying license agreement in the
16// file LICENSE.md.
17// The software is provided "as is" without any warranty by the licensor and
18// without any liability of the licensor, and the software may not be
19// distributed by the licensee; see the license agreement for details.
20//
21// (C) Ralf Möller
22// Computer Engineering
23// Faculty of Technology
24// Bielefeld University
25// www.ti.uni-bielefeld.de
26//
27// ===========================================================================
28
29// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
30// namespace
31// 13. May 23 (Jonas Keller): added Double support
32
33#pragma once
34#ifndef SIMD_VEC_BASE_IMPL_INTEL_32_H_
35#define SIMD_VEC_BASE_IMPL_INTEL_32_H_
36
37#include "../alloc.H"
38#include "../defs.H"
39#include "../types.H"
40#include "../vec.H"
41#include "base_impl_intel16.H"
42#include "intrins_intel.H"
43
44#include <cstddef>
45#include <cstdint>
46#include <limits>
47#include <type_traits>
48
49#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_32_AVAIL_) && \
50 !defined(SIMDVEC_SANDBOX)
51
52namespace simd {
53
54// ===========================================================================
55// NOTES:
56//
57// - setting zero inside the function is not inefficient, see:
58// http://stackoverflow.com/questions/26807285/...
59// ...are-static-static-local-sse-avx-variables-blocking-a-xmm-ymm-register
60//
61// - for some data types (Int, Float) there are no saturated versions
62// of add/sub instructions; in this case we use the unsaturated version;
63 //   the user is responsible for avoiding overflows
64//
65// - _mm512_alignr_epi32/64 are *not* lane-oriented and could be a better
66// solution than the _epi8 version which *is* lane-oriented
67//
68// - should we replace set1 with broadcast? probably the compiler
69// generates broadcast anyhow? apparently not without -O3!
70//
71// - we could improve performance by using 256-bit instructions from
72// AVX512-VL (e.g. permute instructions); at the moment the idea is that
73// typically the widest vector width is used, so if AVX512 is available,
74// AVX/AVX2 would only rarely be used
75//
76// ===========================================================================
77
78// ===========================================================================
79// Vec integer specialization for AVX2
80// ===========================================================================
81
82// partial specialization for SIMD_WIDTH = 32
83template <typename T>
84class Vec<T, 32>
85{
86 __m256i ymm = _mm256_setzero_si256();
87
88public:
89 using Type = T;
90 static constexpr size_t elements = 32 / sizeof(T);
91 static constexpr size_t elems = elements;
92 static constexpr size_t bytes = 32;
93
94 Vec() = default;
95 Vec(const __m256i &x) { ymm = x; }
96 Vec &operator=(const __m256i &x)
97 {
98 ymm = x;
99 return *this;
100 }
101 operator __m256i() const { return ymm; }
102 // for avx2 emulation
103 Vec(const Vec<T, 16> &lo, const Vec<T, 16> &hi)
104 {
105 ymm = _mm256_set_m128i(hi, lo);
106 }
107 SIMD_INLINE Vec<T, 16> lo() const { return _mm256_castsi256_si128(ymm); }
108 SIMD_INLINE Vec<T, 16> hi() const { return _mm256_extractf128_si256(ymm, 1); }
109 // 29. Nov 22 (Jonas Keller):
110 // defined operators new and delete to ensure proper alignment, since
111 // the default new and delete are not guaranteed to do so before C++17
112 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
113 void operator delete(void *p) { aligned_free(p); }
114 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
115 void operator delete[](void *p) { aligned_free(p); }
116 // 05. Sep 23 (Jonas Keller): added allocator
117 using allocator = aligned_allocator<Vec<T, bytes>, bytes>;
118};
119
120// ===========================================================================
121// Vec float specialization for AVX
122// ===========================================================================
123
124template <>
125class Vec<Float, 32>
126{
127 __m256 ymm = _mm256_setzero_ps();
128
129public:
130 using Type = Float;
131 static constexpr size_t elements = 32 / sizeof(Float);
132 static constexpr size_t elems = elements;
133 static constexpr size_t bytes = 32;
134
135 Vec() = default;
136 Vec(const __m256 &x) { ymm = x; }
137 Vec &operator=(const __m256 &x)
138 {
139 ymm = x;
140 return *this;
141 }
142 operator __m256() const { return ymm; }
143 // for avx2 emulation
144 Vec(const Vec<Float, 16> &lo, const Vec<Float, 16> &hi)
145 {
146 ymm = _mm256_set_m128(hi, lo);
147 }
148 SIMD_INLINE Vec<Float, 16> lo() const { return _mm256_castps256_ps128(ymm); }
149 SIMD_INLINE Vec<Float, 16> hi() const
150 {
151 return _mm256_extractf128_ps(ymm, 1);
152 }
153 // 29. Nov 22 (Jonas Keller):
154 // defined operators new and delete to ensure proper alignment, since
155 // the default new and delete are not guaranteed to do so before C++17
156 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
157 void operator delete(void *p) { aligned_free(p); }
158 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
159 void operator delete[](void *p) { aligned_free(p); }
160 // 05. Sep 23 (Jonas Keller): added allocator
161 using allocator = aligned_allocator<Vec<Float, bytes>, bytes>;
162};
163
164// ===========================================================================
165// Vec double specialization for AVX
166// ===========================================================================
167
168template <>
169class Vec<Double, 32>
170{
171 __m256d ymm = _mm256_setzero_pd();
172
173public:
174 using Type = Double;
175 static constexpr size_t elements = 32 / sizeof(Double);
176 static constexpr size_t elems = elements;
177 static constexpr size_t bytes = 32;
178
179 Vec() = default;
180 Vec(const __m256d &x) { ymm = x; }
181 Vec &operator=(const __m256d &x)
182 {
183 ymm = x;
184 return *this;
185 }
186 operator __m256d() const { return ymm; }
187 // for avx2 emulation
188 Vec(const Vec<Double, 16> &lo, const Vec<Double, 16> &hi)
189 {
190 ymm = _mm256_set_m128d(hi, lo);
191 }
192 SIMD_INLINE Vec<Double, 16> lo() const { return _mm256_castpd256_pd128(ymm); }
193 SIMD_INLINE Vec<Double, 16> hi() const
194 {
195 return _mm256_extractf128_pd(ymm, 1);
196 }
197 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
198 void operator delete(void *p) { aligned_free(p); }
199 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
200 void operator delete[](void *p) { aligned_free(p); }
201 using allocator = aligned_allocator<Vec<Double, bytes>, bytes>;
202};
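// Hedged usage sketch (illustrative only, not part of the library interface):
// the Vec wrappers convert implicitly to and from the raw AVX register types
// (__m256i, __m256, __m256d), so intrinsics can be mixed in directly, and the
// nested allocator keeps containers of Vec 32-byte aligned:
//
//   simd::Vec<simd::Short, 32> v = _mm256_set1_epi16(7); // from __m256i
//   __m256i raw                 = v;                      // back to __m256i
//   std::vector<simd::Vec<simd::Float, 32>,
//               simd::Vec<simd::Float, 32>::allocator> buf(16);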
203
204namespace internal {
205namespace base {
206// ===========================================================================
207// auxiliary functions
208// ===========================================================================
209
210// These functions either wrap AVX intrinsics (e.g. to handle
211// immediate arguments as template parameters), or switch between
212// implementations with different AVX* extensions, or provide
213// altered or additional functionality.
214// Only for use in wrapper functions!
215
216// 01. Apr 23 (Jonas Keller): removed some not really necessary internal
217// wrapper functions and inlined them directly where they were used
218
219// ---------------------------------------------------------------------------
220// swizzle_32_16: swizzling of 128-bit lanes (for swizzle)
221// ---------------------------------------------------------------------------
222
223// rearrange vectors such that lane-oriented processing finds the
224// right vectors to combine in corresponding lanes
225//
226// example: (li,hi are lanes)
227//
228// --v0- --v1- --v2-
229// N=3: l0 h0 l1 h1 l2 h2
230// -- --
231// -- --
232// -- --
233// -> l0 h1 h0 l2 l1 h2 (distance = 3 lanes)
234// a0 b1 I=0, a=v0, b=v1
235// a1 b0 I=1, a=v0, b=v1
236// a0 b1 I=2, a=v1, b=v2
237//
238// --v0- --v1- --v2- --v3-
239// N=4: l0 h0 l1 h1 l2 h2 l3 h3
240// -- --
241// -- --
242// -- --
243// -- --
244// -> l0 l2 h0 h2 l1 l3 h1 h3 (distance = 4 lanes)
245// a0 b0 I=0, a=v0, b=v2
246// a1 b0 I=1, a=v0, b=v2
247// a0 b1 I=2, a=v1, b=v3
248// a1 b1 I=3, a=v1, b=v3
249
250// primary template
251template <size_t N, size_t I = 0>
252struct Swizzle_32_16
253{
254 template <typename T>
255 static SIMD_INLINE void _swizzle_32_16(const Vec<T, 32> vIn[N],
256 Vec<T, 32> vOut[N])
257 {
258 // example: N=3 v v
259 // I=0: permute_32_16(vIn[0], vIn[1], _MM_SHUFFLE(0, 2+ 1, 0, 0));
260 // I=1: permute_32_16(vIn[0], vIn[2], _MM_SHUFFLE(0, 2+ 0, 0, 1));
261 // I=2: permute_32_16(vIn[1], vIn[2], _MM_SHUFFLE(0, 2+ 1, 0, 0));
262 //
263 // example: N=4: v v
264 // I=0: permute_32_16(vIn[0], vIn[2], _MM_SHUFFLE(0, 2+ 0, 0, 0));
265 // I=1: permute_32_16(vIn[0], vIn[2], _MM_SHUFFLE(0, 2+ 1, 0, 0));
266 // I=2: permute_32_16(vIn[1], vIn[3], _MM_SHUFFLE(0, 2+ 0, 0, 1));
267 // I=3: permute_32_16(vIn[1], vIn[3], _MM_SHUFFLE(0, 2+ 1, 0, 1));
268 //
269 // "2+" means: take from second vector
270 vOut[I] =
271 _mm256_permute2f128_si256(vIn[I / 2], vIn[(I + N) / 2],
272 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
273 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
274 }
275
276 // Float version
277 static SIMD_INLINE void _swizzle_32_16(const Vec<Float, 32> vIn[N],
278 Vec<Float, 32> vOut[N])
279 {
280 vOut[I] =
281 _mm256_permute2f128_ps(vIn[I / 2], vIn[(I + N) / 2],
282 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
283 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
284 }
285
286 // Double version
287 static SIMD_INLINE void _swizzle_32_16(const Vec<Double, 32> vIn[N],
288 Vec<Double, 32> vOut[N])
289 {
290 vOut[I] =
291 _mm256_permute2f128_pd(vIn[I / 2], vIn[(I + N) / 2],
292 _MM_SHUFFLE(0, (2 + (I + N) % 2), 0, (I % 2)));
293 Swizzle_32_16<N, I + 1>::_swizzle_32_16(vIn, vOut);
294 }
295};
296
297// termination
298template <size_t N>
299struct Swizzle_32_16<N, N>
300{
301 template <typename T>
302 static SIMD_INLINE void _swizzle_32_16(const Vec<T, 32>[N], Vec<T, 32>[N])
303 {}
304};
305
306// swizzle lanes (for implementation of swizzle functions)
307// from Stan Melax: 3D Vector Normalization... (adapted)
308template <size_t N, typename T>
309static SIMD_INLINE void swizzle_32_16(const Vec<T, 32> vIn[N],
310 Vec<T, 32> vOut[N])
311{
312 Swizzle_32_16<N>::_swizzle_32_16(vIn, vOut);
313}
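// Hedged illustration (not part of the library interface): for the simplest
// case N = 2 the recursion above reduces to a transpose of 128-bit lanes
// (same notation as above, low lane listed first):
//
//   Vec<Int, 32> in[2], out[2];
//   // in:  l0 h0  l1 h1
//   swizzle_32_16<2>(in, out);
//   // out: l0 l1  h0 h1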
314
315// ---------------------------------------------------------------------------
316// alignr
317// ---------------------------------------------------------------------------
318
319// 21. Apr 23 (Jonas Keller): replaced IMM range handling via tag dispatch
320// with static_assert, since we don't need the range handling anymore,
321// we just assert that IMM is in range
322
323template <size_t COUNT>
324static SIMD_INLINE __m256i x_mm256_alignr_epi8(__m256i h, __m256i l)
325{
326 // 2. Jul 18 (rm) BUGFIX: 64 -> 32 (2 lanes only, lane-oriented!)
327 static_assert(COUNT < 32, "");
328#ifdef __AVX2__
329 return _mm256_alignr_epi8(h, l, COUNT);
330#else
331 // non-avx2 workaround
332 // (easy since AVX2 instructions operate on lanes anyhow)
333 return _mm256_set_m128i(_mm_alignr_epi8(_mm256_extractf128_si256(h, 1),
334 _mm256_extractf128_si256(l, 1),
335 COUNT),
336 _mm_alignr_epi8(_mm256_castsi256_si128(h),
337 _mm256_castsi256_si128(l), COUNT));
338
339#endif
340}
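// Hedged note on the lane-oriented behaviour: within each 128-bit lane i,
// x_mm256_alignr_epi8<COUNT>(h, l) yields bytes COUNT..COUNT+15 of the
// 32-byte concatenation (l.lane[i] low, h.lane[i] high); it never shifts
// bytes across the lane boundary (that is what the alignr256 helpers further
// below are for).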
341
342// ---------------------------------------------------------------------------
343// auxiliary function for right shift over full 32 byte
344// ---------------------------------------------------------------------------
345
346// (difficulty: _mm256_srli_si256 only works in 128-bit lanes)
347// http://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
348// TODO: finer case distinction using permute4x64?
349
350// 7. Jun 16 (rm): if-chain replaced by tag dispatching
351// (reason: all branches are compiled and at least icc complains
352// about exceeded ranges in immediates)
353
354// COUNT = 0
355template <size_t COUNT>
356static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<true, 0, 16>)
357{
358 return a;
359}
360
361// COUNT = 1..15
362template <size_t COUNT>
363static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<false, 0, 16>)
364{
365 // _MM_SHUFFLE(2,0, 0,1) = 0x81, MS-bit set -> setting elements to zero
366 // higher lane set to zero (2,0), lower lane taken from higher lane (0,1)
367 // a: HHHHHHHHhhhhhhhh LLLLLLLllllllll
368 // _0h: 0000000000000000 HHHHHHHhhhhhhhh (2,0) (0,1)
369 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
370 // e.g. COUNT=5
371 // a: HHHHHHHHhhhhhhhh LLLLLLLllllllll
372 // _0h: 0000000000000000 HHHHHHHhhhhhhhh
373 // alignr H lane: 0000000000000000 HHHHHHHHhhhhhhh
374 // selected: ----- -----------
375 // alignr L lane: HHHHHHHHhhhhhhhh LLLLLLLLlllllll
376 // selected: ----- -----------
377 // alignr: 00000HHHHHHHHhhh hhhhhLLLLLLLlll
378 return x_mm256_alignr_epi8<COUNT>(_0h, a);
379}
380
381// COUNT = 16
382template <size_t COUNT>
383static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a, Range<true, 16, 32>)
384{
385 // _MM_SHUFFLE(2,0, 0,1) = 0x81, MS-bit set -> setting elements to zero
386 // higher lane set to zero (2,0), lower lane taken from higher lane (0,1)
387 // a: HHHHHHHHhhhhhhhh LLLLLLLllllllll
388 // _0h: 0000000000000000 HHHHHHHhhhhhhhh (2,0) (0,1)
389 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
390 // _0h: 0000000000000000 HHHHHHHhhhhhhhh (2,0) (0,1)
391 return _0h;
392}
393
394// COUNT = 17..31
395template <size_t COUNT>
396static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a,
397 Range<false, 16, 32>)
398{
399 // _MM_SHUFFLE(2,0, 0,1) = 0x81, MS-bit set -> setting elements to zero
400 // higher lane set to zero (2,0), lower lane taken from higher lane (0,1)
401 // a: HHHHHHHHhhhhhhhh LLLLLLLllllllll
402 // _0h: 0000000000000000 HHHHHHHhhhhhhhh (2,0) (0,1)
403 __m256i _0h = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1));
404 // e.g. COUNT=18 (18-16 = 2)
405 // _0h: 0000000000000000 HHHHHHHhhhhhhhh
406 // srli: 0000000000000000 00HHHHHHHHhhhhh
407#ifdef __AVX2__
408 return _mm256_srli_si256(_0h, COUNT - 16);
409#else
410 return _mm256_set_m128i(
411 _mm_srli_si128(_mm256_extractf128_si256(_0h, 1), COUNT - 16),
412 _mm_srli_si128(_mm256_castsi256_si128(_0h), COUNT - 16));
413#endif
414}
415
416// COUNT >= 32
417template <size_t, bool AT_LOW_LIM, size_t LOW_LIM_INCL, size_t UP_LIM_EXCL>
418static SIMD_INLINE __m256i
419x_mm256_srli256_si256(__m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
420{
421 return _mm256_setzero_si256();
422}
423
424// hub
425template <size_t COUNT>
426static SIMD_INLINE __m256i x_mm256_srli256_si256(__m256i a)
427{
428 return x_mm256_srli256_si256<COUNT>(a, SizeRange<COUNT, 16>());
429}
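// Hedged illustration: x_mm256_srli256_si256<5>(a) shifts the entire 32-byte
// register right by 5 bytes, i.e. result byte i is byte i+5 of a (zero for
// i+5 >= 32), whereas _mm256_srli_si256 shifts each 128-bit lane separately.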
430
431// ---------------------------------------------------------------------------
432// auxiliary function for left shift over full 32 bytes
433// ---------------------------------------------------------------------------
434
435// http://stackoverflow.com/questions/25248766/
436// emulating-shifts-on-32-bytes-with-avx
437// TODO: finer case distinction using permute4x64?
438
439// 7. Jun 16 (rm): if-chain replaced by tag dispatching
440// (reason: all branches are compiled and at least icc complains
441// about exceeded ranges in immediates)
442
443// COUNT = 0
444template <size_t COUNT>
445static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<true, 0, 16>)
446{
447 return a;
448}
449
450// COUNT = 1..15
451template <size_t COUNT>
452static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<false, 0, 16>)
453{
454 // _MM_SHUFFLE(0,0, 2,0) = 0x08, MS-bit set -> setting elements to zero
455 // higher lane taken from lower lane (0,0), lower lane set to zero (2,0)
456 // a: HHHHHHHHhhhhhhhh LLLLLLLLllllllll
457 // _l0: LLLLLLLLllllllll 0000000000000000 (0,0) (2,0)
458 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
459 // e.g. COUNT = 5: (16-5=11)
460 // _l0: LLLLLLLLllllllll 0000000000000000
461 // a: HHHHHHHHhhhhhhhh LLLLLLLLllllllll
462 // alignr H lane: HHHHHHHHhhhhhhhh LLLLLLLLllllllll
463 // selected: ----------- -----
464 // alignr L lane: LLLLLLLLllllllll 0000000000000000
465 // selected: ----------- -----
466 // alignr: HHHhhhhhhhhLLLLL LLLllllllll00000
467 return x_mm256_alignr_epi8<16 - COUNT>(a, _l0);
468}
469
470// COUNT = 16
471template <size_t COUNT>
472static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a, Range<true, 16, 32>)
473{
474 // _MM_SHUFFLE(0,0, 2,0) = 0x08, MS-bit set -> setting elements to zero
475 // higher lane taken from lower lane (0,0), lower lane set to zero (2,0)
476 // a: HHHHHHHHhhhhhhhh LLLLLLLLllllllll
477 // _l0: LLLLLLLLllllllll 0000000000000000 (0,0) (2,0)
478 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
479 // _l0: LLLLLLLLllllllll 0000000000000000 (0,0) (2,0)
480 return _l0;
481}
482
483// COUNT = 17..31
484template <size_t COUNT>
485static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a,
486 Range<false, 16, 32>)
487{
488 // _MM_SHUFFLE(0,0, 2,0) = 0x08, MS-bit set -> setting elements to zero
489 // higher lane taken from lower lane (0,0), lower lane set to zero (2,0)
490 // a: HHHHHHHHhhhhhhhh LLLLLLLLllllllll
491 // _l0: LLLLLLLLllllllll 0000000000000000 (0,0) (2,0)
492 __m256i _l0 = _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0));
493 // e.g. COUNT = 18 (18-16=2)
494 // _l0: LLLLLLLLllllllll 0000000000000000
495 // slli: LLLLLLllllllll00 0000000000000000
496#ifdef __AVX2__
497 return _mm256_slli_si256(_l0, COUNT - 16);
498#else
499 return _mm256_set_m128i(
500 _mm_slli_si128(_mm256_extractf128_si256(_l0, 1), COUNT - 16),
501 _mm_slli_si128(_mm256_castsi256_si128(_l0), COUNT - 16));
502#endif
503}
504
505// COUNT >= 32
506template <size_t, bool AT_LOW_LIM, size_t LOW_LIM_INCL, size_t UP_LIM_EXCL>
507static SIMD_INLINE __m256i
508x_mm256_slli256_si256(__m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
509{
510 return _mm256_setzero_si256();
511}
512
513// hub
514template <size_t COUNT>
515static SIMD_INLINE __m256i x_mm256_slli256_si256(__m256i a)
516{
517 return x_mm256_slli256_si256<COUNT>(a, SizeRange<COUNT, 16>());
518}
519
520// ---------------------------------------------------------------------------
521// full 32 byte alignr ("alignr256")
522// ---------------------------------------------------------------------------
523
524// h: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
525// l: LLLLLLLLLLLLLLLL llllllllllllllll
526// 000 HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL llllllllllllllll
527// 0: ---------------- ----------------
528// 5: ------ ---------------- ----------
529// 16: ---------------- ----------------
530// 18: --- ---------------- -------------
531// 32: ---------------- ----------------
532// 35: --- ---------------- -------------
533
534// modified from emmanualLattia at
535// https://idz-smita-idzdev.ssgisp.com/fr-fr/forums/topic/500664
536
537// 7. Jun 16 (rm): if-chain replaced by tag dispatching
538// (reason: all branches are compiled and at least icc complains
539// about exceeded ranges in immediates)
540
541// COUNT = 0
542template <size_t COUNT>
543static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i, __m256i low,
544 Range<true, 0, 16>)
545{
546 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
547 // low: LLLLLLLLLLLLLLLL llllllllllllllll
548 // COUNT == 0: LLLLLLLLLLLLLLLL llllllllllllllll
549 return low;
550}
551
552// COUNT = 1..15
553template <size_t COUNT>
554static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
555 Range<false, 0, 16>)
556{
557 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
558 // low: LLLLLLLLLLLLLLLL llllllllllllllll
559 // high0low1: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL (0,2) (0,1)
560 __m256i high0_low1 =
561 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
562 // e.g. COUNT = 5
563 // low: LLLLLLLLLLLLLLLL llllllllllllllll
564 // high0low1: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL (0,2) (0,1)
565 // alignr H lane: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL
566 // selected: ----- -----------
567 // alignr L lane: LLLLLLLLLLLLLLLL llllllllllllllll
568 // selected: ----- -----------
569 // alignr: hhhhhLLLLLLLLLLL LLLLLlllllllllll
570 return x_mm256_alignr_epi8<COUNT>(high0_low1, low);
571}
572
573// COUNT = 16
574template <size_t COUNT>
575static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
576 Range<true, 16, 32>)
577{
578 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
579 // low: LLLLLLLLLLLLLLLL llllllllllllllll
580 // high0low1: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL (0,2) (0,1)
581 __m256i high0_low1 =
582 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
583 // COUNT == 16: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL
584 return high0_low1;
585}
586
587// COUNT = 17..31
588template <size_t COUNT>
589static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low,
590 Range<false, 16, 32>)
591{
592 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
593 // low: LLLLLLLLLLLLLLLL llllllllllllllll
594 // high0low1: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL (0,2) (0,1)
595 __m256i high0_low1 =
596 _mm256_permute2f128_si256(low, high, _MM_SHUFFLE(0, 2, 0, 1));
597 // e.g. COUNT = 18 (COUNT - 16 = 2)
598 // high0low1: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL
599 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
600 // alignr H lane: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
601 // selected: -- --------------
602 // alignr L lane: hhhhhhhhhhhhhhhh LLLLLLLLLLLLLLLL
603 // selected: -- --------------
604 // alignr: HHhhhhhhhhhhhhhh hhLLLLLLLLLLLLLL
605 return x_mm256_alignr_epi8<COUNT - 16>(high, high0_low1);
606}
607
608// COUNT = 32
609template <size_t COUNT>
610static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
611 Range<true, 32, 48>)
612{
613 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
614 // low: LLLLLLLLLLLLLLLL llllllllllllllll
615 // HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
616 return high;
617}
618
619// COUNT = 33..47
620template <size_t COUNT>
621static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
622 Range<false, 32, 48>)
623{
624 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
625 // low: LLLLLLLLLLLLLLLL llllllllllllllll
626 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH (2,0) (0,1)
627 __m256i null_high1 =
628 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
629 // e.g. COUNT = 37 (37-32 = 5)
630 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
631 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH
632 // alignr H lane 0000000000000000 HHHHHHHHHHHHHHHH
633 // selected: ----- -----------
634 // alignr L lane HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
635 // selected: ----- -----------
636 // alignr: 00000HHHHHHHHHHH HHHHHhhhhhhhhhhh
637 return x_mm256_alignr_epi8<COUNT - 32>(null_high1, high);
638}
639
640// COUNT == 48
641template <size_t COUNT>
642static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
643 Range<true, 48, 64>)
644{
645 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
646 // low: LLLLLLLLLLLLLLLL llllllllllllllll
647 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH (2,0) (0,1)
648 __m256i null_high1 =
649 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
650 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH
651 return null_high1;
652}
653
654// COUNT = 49..63
655template <size_t COUNT>
656static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i,
657 Range<false, 48, 64>)
658{
659 // high: HHHHHHHHHHHHHHHH hhhhhhhhhhhhhhhh
660 // low: LLLLLLLLLLLLLLLL llllllllllllllll
661 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH (2,0) (0,1)
662 __m256i null_high1 =
663 _mm256_permute2f128_si256(high, high, _MM_SHUFFLE(2, 0, 0, 1));
664 // e.g. COUNT = 50 (50 - 48 = 2)
665 // null_high1: 0000000000000000 HHHHHHHHHHHHHHHH
666 // zero: 0000000000000000 0000000000000000
667 // alignr H lane: 0000000000000000 0000000000000000
668 // selected: -- --------------
669 // alignr L lane: 0000000000000000 HHHHHHHHHHHHHHHH
670 // selected: -- --------------
671 // alignr: 0000000000000000 00HHHHHHHHHHHHHH
672 return x_mm256_alignr_epi8<COUNT - 48>(_mm256_setzero_si256(), null_high1);
673}
674
675// COUNT >= 64
676template <size_t COUNT, bool AT_LOW_LIM, size_t LOW_LIM_INCL,
677 size_t UP_LIM_EXCL>
678static SIMD_INLINE __m256i x_mm256_alignr256_epi8(
679 __m256i, __m256i, Range<AT_LOW_LIM, LOW_LIM_INCL, UP_LIM_EXCL>)
680{
681 return _mm256_setzero_si256();
682}
683
684// hub
685template <size_t COUNT>
686static SIMD_INLINE __m256i x_mm256_alignr256_epi8(__m256i high, __m256i low)
687{
688 return x_mm256_alignr256_epi8<COUNT>(high, low, SizeRange<COUNT, 16>());
689}
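// Hedged usage sketch (illustrative only; like all helpers here it is meant
// for the wrapper functions below): a full-width alignr is the usual building
// block for reading an unaligned 32-byte window from two consecutive aligned
// loads, e.g. bytes p[3]..p[34] of a 32-byte aligned byte buffer p:
//
//   __m256i lo     = _mm256_load_si256((const __m256i *) p);
//   __m256i hi     = _mm256_load_si256((const __m256i *) (p + 32));
//   __m256i window = x_mm256_alignr256_epi8<3>(hi, lo);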
690
691// ---------------------------------------------------------------------------
692// insert 16 byte vector a into both lanes of a 32 byte vector
693// ---------------------------------------------------------------------------
694
695static SIMD_INLINE __m256i x_mm256_duplicate_si128(__m128i a)
696{
697 return _mm256_set_m128i(a, a);
698}
699
700// ---------------------------------------------------------------------------
701// transpose4x64
702// ---------------------------------------------------------------------------
703
704// in = Hh Hl Lh Ll
705// | X |
706// out = Hh Lh Hl Ll
707
708static SIMD_INLINE __m256i x_mm256_transpose4x64_epi64(__m256i a)
709{
710#ifdef __AVX2__
711 return _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
712#else
713 // non-avx2 workarounds (different versions)
714
715#if 1
716 // non-avx2 workaround
717 // (more efficient)
718
719 __m256d in, x1, x2;
720 // in = Hh Hl Lh Ll
721 in = _mm256_castsi256_pd(a);
722 // only lower 4 bit are used
723 // in = Hh Hl Lh Ll
724 // 0 1 0 1 = (0,0,1,1)
725 // x1 = Hl Hh Ll Lh
726 x1 = _mm256_permute_pd(in, _MM_SHUFFLE(0, 0, 1, 1));
727 // all 8 bit are used
728 // x1 = Hl Hh Ll Lh
729 // 0 0 1 1
730 // x2 = Ll Lh Hl Hh
731 x2 = _mm256_permute2f128_pd(x1, x1, _MM_SHUFFLE(0, 0, 1, 1));
732 // only lower 4 bit are used
733 // in = Hh Hl Lh Ll
734 // x2 = Ll Lh Hl Hh
735 // 0 1 1 0 = (0,0,1,2)
736 // ret: Hh Lh Hl Ll
737 return _mm256_castpd_si256(_mm256_blend_pd(in, x2, _MM_SHUFFLE(0, 0, 1, 2)));
738#else
739 // non-avx2 workaround
740 // (less efficient)
741
742 __m128i lo = _mm256_castsi256_si128(a);
743 __m128i hi = _mm256_extractf128_si256(a, 1);
744 __m128i loRes = _mm_unpacklo_epi64(lo, hi);
745 __m128i hiRes = _mm_unpackhi_epi64(lo, hi);
746 return _mm256_set_m128i(hiRes, loRes);
747#endif
748
749#endif
750}
751
752static SIMD_INLINE __m256 x_mm256_transpose4x64_ps(__m256 a)
753{
754 return _mm256_castsi256_ps(
755 x_mm256_transpose4x64_epi64(_mm256_castps_si256(a)));
756}
757
758static SIMD_INLINE __m256d x_mm256_transpose4x64_pd(__m256d a)
759{
760 return _mm256_castsi256_pd(
761 x_mm256_transpose4x64_epi64(_mm256_castpd_si256(a)));
762}
763
764// ---------------------------------------------------------------------------
765// unpack of 2 ps
766// ---------------------------------------------------------------------------
767
768static SIMD_INLINE __m256 x_mm256_unpacklo_2ps(__m256 a, __m256 b)
769{
770 return _mm256_castpd_ps(
771 _mm256_unpacklo_pd(_mm256_castps_pd(a), _mm256_castps_pd(b)));
772}
773
774static SIMD_INLINE __m256 x_mm256_unpackhi_2ps(__m256 a, __m256 b)
775{
776 return _mm256_castpd_ps(
777 _mm256_unpackhi_pd(_mm256_castps_pd(a), _mm256_castps_pd(b)));
778}
779
780// ---------------------------------------------------------------------------
781// binary functions with non-avx2 workarounds
782// ---------------------------------------------------------------------------
783
784#ifdef __AVX2__
785// avx2 is available
786#define SIMDVEC_INTEL_X_INT_BINFCT_32(INTRIN) \
787 static SIMD_INLINE __m256i x_mm256_##INTRIN(__m256i a, __m256i b) \
788 { \
789 return _mm256_##INTRIN(a, b); \
790 }
791#else
792// non-avx2 workaround
793#define SIMDVEC_INTEL_X_INT_BINFCT_32(INTRIN) \
794 static SIMD_INLINE __m256i x_mm256_##INTRIN(__m256i a, __m256i b) \
795 { \
796 return _mm256_set_m128i( \
797 _mm_##INTRIN(_mm256_extractf128_si256(a, 1), \
798 _mm256_extractf128_si256(b, 1)), \
799 _mm_##INTRIN(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))); \
800 }
801#endif
802
803SIMDVEC_INTEL_X_INT_BINFCT_32(unpacklo_epi8)
804SIMDVEC_INTEL_X_INT_BINFCT_32(unpackhi_epi8)
805SIMDVEC_INTEL_X_INT_BINFCT_32(unpacklo_epi16)
806SIMDVEC_INTEL_X_INT_BINFCT_32(unpackhi_epi16)
807SIMDVEC_INTEL_X_INT_BINFCT_32(shuffle_epi8)
808SIMDVEC_INTEL_X_INT_BINFCT_32(packs_epi16)
809SIMDVEC_INTEL_X_INT_BINFCT_32(packs_epi32)
810SIMDVEC_INTEL_X_INT_BINFCT_32(packus_epi16)
811SIMDVEC_INTEL_X_INT_BINFCT_32(packus_epi32)
812SIMDVEC_INTEL_X_INT_BINFCT_32(hadd_epi16)
813SIMDVEC_INTEL_X_INT_BINFCT_32(hadd_epi32)
814SIMDVEC_INTEL_X_INT_BINFCT_32(hadds_epi16)
815SIMDVEC_INTEL_X_INT_BINFCT_32(hsub_epi16)
816SIMDVEC_INTEL_X_INT_BINFCT_32(hsub_epi32)
817SIMDVEC_INTEL_X_INT_BINFCT_32(hsubs_epi16)
818
819// non-avx2 workarounds via analogous ps, pd functions
820#ifdef __AVX2__
821// avx2 is available
822#define SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(INTRIN, INTSUFFIX, PSPDSUFFIX) \
823 static SIMD_INLINE __m256i x_mm256_##INTRIN##_##INTSUFFIX(__m256i a, \
824 __m256i b) \
825 { \
826 return _mm256_##INTRIN##_##INTSUFFIX(a, b); \
827 }
828#else
829// non-avx2 workaround
830#define SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(INTRIN, INTSUFFIX, PSPDSUFFIX) \
831 static SIMD_INLINE __m256i x_mm256_##INTRIN##_##INTSUFFIX(__m256i a, \
832 __m256i b) \
833 { \
834 return _mm256_cast##PSPDSUFFIX##_si256( \
835 _mm256_##INTRIN##_##PSPDSUFFIX(_mm256_castsi256##_##PSPDSUFFIX(a), \
836 _mm256_castsi256##_##PSPDSUFFIX(b))); \
837 }
838#endif
839
840// better non-avx2 workarounds for unpacks (32, 64) via ps, pd
841SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpacklo, epi32, ps)
842SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpackhi, epi32, ps)
843SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpacklo, epi64, pd)
844SIMDVEC_INTEL_X_INT_BINFCT_PSPD_32(unpackhi, epi64, pd)
845
846// ###########################################################################
847// ###########################################################################
848// ###########################################################################
849
850// ===========================================================================
851// Vec template function specializations or overloading for AVX
852// ===========================================================================
853
854// ---------------------------------------------------------------------------
855// reinterpretation casts
856// ---------------------------------------------------------------------------
857
858// 08. Apr 23 (Jonas Keller): used enable_if for cleaner implementation
859
860// between all integer types
861template <typename Tdst, typename Tsrc,
862 SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
863 std::is_integral<Tdst>::value &&
864 std::is_integral<Tsrc>::value))>
865static SIMD_INLINE Vec<Tdst, 32> reinterpret(const Vec<Tsrc, 32> &vec,
866 OutputType<Tdst>)
867{
868 // 26. Nov 22 (Jonas Keller): reinterpret_cast is technically undefined
869 // behavior, so just rewrapping the vector register in a new Vec instead
870 // return reinterpret_cast<const Vec<Tdst,32>&>(vec);
871 return Vec<Tdst, 32>(__m256i(vec));
872}
873
874// from float to any integer type
875template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
876static SIMD_INLINE Vec<Tdst, 32> reinterpret(const Vec<Float, 32> &vec,
877 OutputType<Tdst>)
878{
879 return _mm256_castps_si256(vec);
880}
881
882// from any integer type to float
883template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
884static SIMD_INLINE Vec<Float, 32> reinterpret(const Vec<Tsrc, 32> &vec,
885 OutputType<Float>)
886{
887 return _mm256_castsi256_ps(vec);
888}
889
890// from double to any integer type
891template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
892static SIMD_INLINE Vec<Tdst, 32> reinterpret(const Vec<Double, 32> &vec,
893 OutputType<Tdst>)
894{
895 return _mm256_castpd_si256(vec);
896}
897
898// from any integer type to double
899template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
900static SIMD_INLINE Vec<Double, 32> reinterpret(const Vec<Tsrc, 32> &vec,
901 OutputType<Double>)
902{
903 return _mm256_castsi256_pd(vec);
904}
905
906// from float to double
907static SIMD_INLINE Vec<Double, 32> reinterpret(const Vec<Float, 32> &vec,
908 OutputType<Double>)
909{
910 return _mm256_castps_pd(vec);
911}
912
913// from double to float
914static SIMD_INLINE Vec<Float, 32> reinterpret(const Vec<Double, 32> &vec,
915 OutputType<Float>)
916{
917 return _mm256_castpd_ps(vec);
918}
919
920// between identical types
921template <typename T>
922static SIMD_INLINE Vec<T, 32> reinterpret(const Vec<T, 32> &vec, OutputType<T>)
923{
924 return vec;
925}
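// Hedged usage sketch (illustrative only): reinterpret merely rewraps the
// register, it never converts values; e.g. reading the bit pattern of a
// Float vector as Int:
//
//   Vec<Float, 32> f = set1(-1.0f, Integer<32>());
//   Vec<Int, 32> i   = reinterpret(f, OutputType<Int>());
//   // every Int element now holds 0xBF800000, the bit pattern of -1.0f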
926
927// ---------------------------------------------------------------------------
928// convert (without changing the number of elements)
929// ---------------------------------------------------------------------------
930
931// conversion with saturation; we wanted to have a fast solution that
932// doesn't trigger the overflow which results in a negative two's
933// complement result ("invalid int32": 0x80000000); therefore we clamp
934// the positive values at the maximal positive float which is
935// convertible to int32 without overflow (0x7fffff80 = 2147483520);
936// negative values cannot overflow (they are clamped to invalid int
937// which is the most negative int32)
938static SIMD_INLINE Vec<Int, 32> cvts(const Vec<Float, 32> &a, OutputType<Int>)
939{
940 // TODO: analyze much more complex solution for cvts at
941 // TODO: http://stackoverflow.com/questions/9157373/
942 // TODO: most-efficient-way-to-convert-vector-of-float-to-vector-of-uint32
943 __m256 clip = _mm256_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
944 return _mm256_cvtps_epi32(_mm256_min_ps(clip, a));
945}
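// Hedged example of the clamping described above: for an input vector with
// all elements 3.0e9f the result is all 2147483520 (clamped to the largest
// convertible float, then converted); for all elements -3.0e9f the result is
// all -2147483648 (the "invalid int32" produced by the overflowing
// conversion, which here coincides with the correctly saturated value).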
946
947// saturation is not necessary in this case
948static SIMD_INLINE Vec<Float, 32> cvts(const Vec<Int, 32> &a, OutputType<Float>)
949{
950 return _mm256_cvtepi32_ps(a);
951}
952
953static SIMD_INLINE Vec<Long, 32> cvts(const Vec<Double, 32> &a,
954 OutputType<Long>)
955{
956 // _mm256_cvtpd_epi64 is only available with AVX512
957 // using serial workaround instead
958 Double tmpD[4] SIMD_ATTR_ALIGNED(32);
959 _mm256_store_pd(tmpD, a);
960 Long tmpL[4] SIMD_ATTR_ALIGNED(32);
961 for (int i = 0; i < 4; ++i) {
962 tmpL[i] =
963 Long(std::rint(std::min(tmpD[i], MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64)));
964 }
965 return _mm256_load_si256((__m256i *) tmpL);
966}
967
968static SIMD_INLINE Vec<Double, 32> cvts(const Vec<Long, 32> &a,
969 OutputType<Double>)
970{
971#ifdef __AVX2__
972 // workaround from https://stackoverflow.com/a/41148578 (modified)
973 __m256i xH = _mm256_srai_epi32(a, 16);
974 xH = _mm256_and_si256(xH, _mm256_set1_epi64x(0xffffffff00000000));
975 xH = _mm256_add_epi64(
976 xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
977 __m256i xL = _mm256_blend_epi16(
978 a, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0x88); // 2^52
979 __m256d f =
980 _mm256_sub_pd(_mm256_castsi256_pd(xH),
981 _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
982 return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
983#else
984 // non-avx2 workaround
985 return Vec<Double, 32>(cvts(a.lo(), OutputType<Double>()),
986 cvts(a.hi(), OutputType<Double>()));
987#endif
988}
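// Hedged sketch of the idea behind the magic constants above: xL is a double
// with exponent 2^52 whose mantissa holds the low 48 bits of a, xH is a
// double near 3*2^67 whose mantissa holds the sign-extended high 16 bits;
// subtracting the combined bias 3*2^67 + 2^52 and adding the two parts
// reassembles the signed 64-bit value, exactly if it fits into 53 bits of
// precision and with rounding to the nearest double otherwise.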
989
990// ---------------------------------------------------------------------------
991// setzero
992// ---------------------------------------------------------------------------
993
994template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
995static SIMD_INLINE Vec<T, 32> setzero(OutputType<T>, Integer<32>)
996{
997 return _mm256_setzero_si256();
998}
999
1000static SIMD_INLINE Vec<Float, 32> setzero(OutputType<Float>, Integer<32>)
1001{
1002 return _mm256_setzero_ps();
1003}
1004
1005static SIMD_INLINE Vec<Double, 32> setzero(OutputType<Double>, Integer<32>)
1006{
1007 return _mm256_setzero_pd();
1008}
1009
1010// ---------------------------------------------------------------------------
1011// set1
1012// ---------------------------------------------------------------------------
1013
1014static SIMD_INLINE Vec<Byte, 32> set1(Byte a, Integer<32>)
1015{
1016 return _mm256_set1_epi8(a);
1017}
1018
1019static SIMD_INLINE Vec<SignedByte, 32> set1(SignedByte a, Integer<32>)
1020{
1021 return _mm256_set1_epi8(a);
1022}
1023
1024static SIMD_INLINE Vec<Word, 32> set1(Word a, Integer<32>)
1025{
1026 return _mm256_set1_epi16(a);
1027}
1028
1029static SIMD_INLINE Vec<Short, 32> set1(Short a, Integer<32>)
1030{
1031 return _mm256_set1_epi16(a);
1032}
1033
1034static SIMD_INLINE Vec<Int, 32> set1(Int a, Integer<32>)
1035{
1036 return _mm256_set1_epi32(a);
1037}
1038
1039static SIMD_INLINE Vec<Long, 32> set1(Long a, Integer<32>)
1040{
1041 return _mm256_set1_epi64x(a);
1042}
1043
1044static SIMD_INLINE Vec<Float, 32> set1(Float a, Integer<32>)
1045{
1046 return _mm256_set1_ps(a);
1047}
1048
1049static SIMD_INLINE Vec<Double, 32> set1(Double a, Integer<32>)
1050{
1051 return _mm256_set1_pd(a);
1052}
1053
1054// ---------------------------------------------------------------------------
1055// load
1056// ---------------------------------------------------------------------------
1057
1058template <typename T>
1059static SIMD_INLINE Vec<T, 32> load(const T *const p, Integer<32>)
1060{
1061 // AVX load and store instructions need alignment to 32 byte
1062 // (lower 5 bit need to be zero)
1063 SIMD_CHECK_ALIGNMENT(p, 32);
1064 return _mm256_load_si256((__m256i *) p);
1065}
1066
1067static SIMD_INLINE Vec<Float, 32> load(const Float *const p, Integer<32>)
1068{
1069 // AVX load and store instructions need alignment to 32 byte
1070 // (lower 5 bit need to be zero)
1071 SIMD_CHECK_ALIGNMENT(p, 32);
1072 return _mm256_load_ps(p);
1073}
1074
1075static SIMD_INLINE Vec<Double, 32> load(const Double *const p, Integer<32>)
1076{
1077 // AVX load and store instructions need alignment to 32 byte
1078 // (lower 5 bit need to be zero)
1079 SIMD_CHECK_ALIGNMENT(p, 32);
1080 return _mm256_load_pd(p);
1081}
1082
1083// ---------------------------------------------------------------------------
1084// loadu
1085// ---------------------------------------------------------------------------
1086
1087template <typename T>
1088static SIMD_INLINE Vec<T, 32> loadu(const T *const p, Integer<32>)
1089{
1090 return _mm256_loadu_si256((__m256i *) p);
1091}
1092
1093static SIMD_INLINE Vec<Float, 32> loadu(const Float *const p, Integer<32>)
1094{
1095 return _mm256_loadu_ps(p);
1096}
1097
1098static SIMD_INLINE Vec<Double, 32> loadu(const Double *const p, Integer<32>)
1099{
1100 return _mm256_loadu_pd(p);
1101}
1102
1103// ---------------------------------------------------------------------------
1104// store
1105// ---------------------------------------------------------------------------
1106
1107// all integer versions
1108template <typename T>
1109static SIMD_INLINE void store(T *const p, const Vec<T, 32> &a)
1110{
1111 // AVX load and store instructions need alignment to 32 byte
1112 // (lower 5 bit need to be zero)
1113 SIMD_CHECK_ALIGNMENT(p, 32);
1114 _mm256_store_si256((__m256i *) p, a);
1115}
1116
1117// float version
1118static SIMD_INLINE void store(Float *const p, const Vec<Float, 32> &a)
1119{
1120 // AVX load and store instructions need alignment to 32 byte
1121 // (lower 5 bit need to be zero)
1122 SIMD_CHECK_ALIGNMENT(p, 32);
1123 _mm256_store_ps(p, a);
1124}
1125
1126// double version
1127static SIMD_INLINE void store(Double *const p, const Vec<Double, 32> &a)
1128{
1129 // AVX load and store instructions need alignment to 32 byte
1130 // (lower 5 bit need to be zero)
1131 SIMD_CHECK_ALIGNMENT(p, 32);
1132 _mm256_store_pd(p, a);
1133}
1134
1135// ---------------------------------------------------------------------------
1136// storeu
1137// ---------------------------------------------------------------------------
1138
1139// all integer versions
1140template <typename T>
1141static SIMD_INLINE void storeu(T *const p, const Vec<T, 32> &a)
1142{
1143 _mm256_storeu_si256((__m256i *) p, a);
1144}
1145
1146// float version
1147static SIMD_INLINE void storeu(Float *const p, const Vec<Float, 32> &a)
1148{
1149 _mm256_storeu_ps(p, a);
1150}
1151
1152// double version
1153static SIMD_INLINE void storeu(Double *const p, const Vec<Double, 32> &a)
1154{
1155 _mm256_storeu_pd(p, a);
1156}
1157
1158// ---------------------------------------------------------------------------
1159// stream_store
1160// ---------------------------------------------------------------------------
1161
1162// all integer versions
1163template <typename T>
1164static SIMD_INLINE void stream_store(T *const p, const Vec<T, 32> &a)
1165{
1166 // AVX load and store instructions need alignment to 32 byte
1167 // (lower 5 bit need to be zero)
1168 SIMD_CHECK_ALIGNMENT(p, 32);
1169 _mm256_stream_si256((__m256i *) p, a);
1170}
1171
1172// float version
1173static SIMD_INLINE void stream_store(Float *const p, const Vec<Float, 32> &a)
1174{
1175 // AVX load and store instructions need alignment to 32 byte
1176 // (lower 5 bit need to be zero)
1177 SIMD_CHECK_ALIGNMENT(p, 32);
1178 _mm256_stream_ps(p, a);
1179}
1180
1181// double version
1182static SIMD_INLINE void stream_store(Double *const p, const Vec<Double, 32> &a)
1183{
1184 // AVX load and store instructions need alignment to 32 byte
1185 // (lower 5 bit need to be zero)
1186 SIMD_CHECK_ALIGNMENT(p, 32);
1187 _mm256_stream_pd(p, a);
1188}
1189
1190// ---------------------------------------------------------------------------
1191// extract
1192// ---------------------------------------------------------------------------
1193
1194template <size_t COUNT>
1195static SIMD_INLINE Byte extract(const Vec<Byte, 32> &a)
1196{
1197 SIMD_IF_CONSTEXPR (COUNT < 32) {
1198 // strange, Intel intrinsics guide says this is AVX2, but it is already
1199 // available in avxintrin.h
1200 return _mm256_extract_epi8(a, COUNT);
1201 } else {
1202 return 0;
1203 }
1204}
1205
1206template <size_t COUNT>
1207static SIMD_INLINE SignedByte extract(const Vec<SignedByte, 32> &a)
1208{
1209 return ::simd::internal::bit_cast<SignedByte>(
1210 extract<COUNT>(reinterpret(a, OutputType<Byte>())));
1211}
1212
1213template <size_t COUNT>
1214static SIMD_INLINE Word extract(const Vec<Word, 32> &a)
1215{
1216 SIMD_IF_CONSTEXPR (COUNT < 16) {
1217 // strange, Intel intrinsics guide says this is AVX2, but it is already
1218 // available in avxintrin.h
1219 return _mm256_extract_epi16(a, COUNT);
1220 } else {
1221 return 0;
1222 }
1223}
1224
1225template <size_t COUNT>
1226static SIMD_INLINE Short extract(const Vec<Short, 32> &a)
1227{
1228 return ::simd::internal::bit_cast<Short>(
1229 extract<COUNT>(reinterpret(a, OutputType<Word>())));
1230}
1231
1232template <size_t COUNT>
1233static SIMD_INLINE Int extract(const Vec<Int, 32> &a)
1234{
1235 SIMD_IF_CONSTEXPR (COUNT < 8) {
1236 return _mm256_extract_epi32(a, COUNT);
1237 } else {
1238 return 0;
1239 }
1240}
1241
1242template <size_t COUNT>
1243static SIMD_INLINE Long extract(const Vec<Long, 32> &a)
1244{
1245 SIMD_IF_CONSTEXPR (COUNT < 4) {
1246 return _mm256_extract_epi64(a, COUNT);
1247 } else {
1248 return 0;
1249 }
1250}
1251
1252template <size_t COUNT>
1253static SIMD_INLINE Float extract(const Vec<Float, 32> &a)
1254{
1255 return ::simd::internal::bit_cast<Float>(
1256 extract<COUNT>(reinterpret(a, OutputType<Int>())));
1257}
1258
1259template <size_t COUNT>
1260static SIMD_INLINE Double extract(const Vec<Double, 32> &a)
1261{
1262 SIMD_IF_CONSTEXPR (COUNT < 4) {
1263 return ::simd::internal::bit_cast<Double>(
1264 _mm256_extract_epi64(_mm256_castpd_si256(a), COUNT));
1265 } else {
1266 return 0;
1267 }
1268}
1269
1270// ---------------------------------------------------------------------------
1271// add
1272// ---------------------------------------------------------------------------
1273
1274#ifdef __AVX2__
1275
1276static SIMD_INLINE Vec<Byte, 32> add(const Vec<Byte, 32> &a,
1277 const Vec<Byte, 32> &b)
1278{
1279 return _mm256_add_epi8(a, b);
1280}
1281
1282static SIMD_INLINE Vec<SignedByte, 32> add(const Vec<SignedByte, 32> &a,
1283 const Vec<SignedByte, 32> &b)
1284{
1285 return _mm256_add_epi8(a, b);
1286}
1287
1288static SIMD_INLINE Vec<Word, 32> add(const Vec<Word, 32> &a,
1289 const Vec<Word, 32> &b)
1290{
1291 return _mm256_add_epi16(a, b);
1292}
1293
1294static SIMD_INLINE Vec<Short, 32> add(const Vec<Short, 32> &a,
1295 const Vec<Short, 32> &b)
1296{
1297 return _mm256_add_epi16(a, b);
1298}
1299
1300static SIMD_INLINE Vec<Int, 32> add(const Vec<Int, 32> &a,
1301 const Vec<Int, 32> &b)
1302{
1303 return _mm256_add_epi32(a, b);
1304}
1305
1306static SIMD_INLINE Vec<Long, 32> add(const Vec<Long, 32> &a,
1307 const Vec<Long, 32> &b)
1308{
1309 return _mm256_add_epi64(a, b);
1310}
1311
1312#else
1313
1314// non-avx2 workaround
1315template <typename T>
1316static SIMD_INLINE Vec<T, 32> add(const Vec<T, 32> &a, const Vec<T, 32> &b)
1317{
1318 return Vec<T, 32>(add(a.lo(), b.lo()), add(a.hi(), b.hi()));
1319}
1320
1321#endif
1322
1323static SIMD_INLINE Vec<Float, 32> add(const Vec<Float, 32> &a,
1324 const Vec<Float, 32> &b)
1325{
1326 return _mm256_add_ps(a, b);
1327}
1328
1329static SIMD_INLINE Vec<Double, 32> add(const Vec<Double, 32> &a,
1330 const Vec<Double, 32> &b)
1331{
1332 return _mm256_add_pd(a, b);
1333}
1334
1335// ---------------------------------------------------------------------------
1336// adds
1337// ---------------------------------------------------------------------------
1338
1339#ifdef __AVX2__
1340
1341static SIMD_INLINE Vec<Byte, 32> adds(const Vec<Byte, 32> &a,
1342 const Vec<Byte, 32> &b)
1343{
1344 return _mm256_adds_epu8(a, b);
1345}
1346
1347static SIMD_INLINE Vec<SignedByte, 32> adds(const Vec<SignedByte, 32> &a,
1348 const Vec<SignedByte, 32> &b)
1349{
1350 return _mm256_adds_epi8(a, b);
1351}
1352
1353static SIMD_INLINE Vec<Word, 32> adds(const Vec<Word, 32> &a,
1354 const Vec<Word, 32> &b)
1355{
1356 return _mm256_adds_epu16(a, b);
1357}
1358
1359static SIMD_INLINE Vec<Short, 32> adds(const Vec<Short, 32> &a,
1360 const Vec<Short, 32> &b)
1361{
1362 return _mm256_adds_epi16(a, b);
1363}
1364
1365static SIMD_INLINE Vec<Int, 32> adds(const Vec<Int, 32> &a,
1366 const Vec<Int, 32> &b)
1367{
1368 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
1369 // saturated
1370
1371 // _mm256_adds_epi32 does not exist, workaround:
1372 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
1373 // addition occurs if and only if the operands have the same sign and the
1374 // sum has a sign opposite to that of the operands."
1375 __m256i sum = _mm256_add_epi32(a, b);
1376 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1377 __m256i sumHasDiffSign = _mm256_xor_si256(a, sum);
1378 // indicates when an overflow has occurred
1379 __m256i overflow =
1380 _mm256_srai_epi32(_mm256_andnot_si256(opsHaveDiffSign, sumHasDiffSign), 31);
1381 // saturated sum for if overflow occurred (0x7FFFFFFF=max positive int, when
1382 // sign of a (and thus b as well) is 0, 0x80000000=min negative int, when sign
1383 // of a (and thus b as well) is 1)
1384 __m256i saturatedSum =
1385 _mm256_xor_si256(_mm256_srai_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF));
1386 // return saturated sum if overflow occurred, otherwise return sum
1387 return _mm256_or_si256(_mm256_andnot_si256(overflow, sum),
1388 _mm256_and_si256(overflow, saturatedSum));
1389}
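// Hedged scalar sketch of the same saturation logic (illustrative only, not
// part of the library):
//
//   int32_t adds_sketch(int32_t a, int32_t b)
//   {
//     uint32_t ua = (uint32_t) a, ub = (uint32_t) b, sum = ua + ub;
//     // overflow iff the operands have the same sign and the sum's sign
//     // differs from it
//     bool overflow = ((~(ua ^ ub) & (ua ^ sum)) >> 31) != 0;
//     // 0x7FFFFFFF if a >= 0, 0x80000000 if a < 0
//     uint32_t saturated = (ua >> 31) + 0x7FFFFFFFu;
//     return (int32_t) (overflow ? saturated : sum);
//   }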
1390
1391static SIMD_INLINE Vec<Long, 32> adds(const Vec<Long, 32> &a,
1392 const Vec<Long, 32> &b)
1393{
1394 // _mm256_adds_epi64 does not exist, workaround:
1395 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
1396 // addition occurs if and only if the operands have the same sign and the
1397 // sum has a sign opposite to that of the operands."
1398 __m256i sum = _mm256_add_epi64(a, b);
1399 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1400 __m256i sumHasDiffSign = _mm256_xor_si256(a, sum);
1401 // indicates when an overflow has occurred
1402 __m256i overflow32 =
1403 _mm256_srai_epi32(_mm256_andnot_si256(opsHaveDiffSign, sumHasDiffSign), 31);
1404 // duplicate result to other half of 64 bit int
1405 __m256i overflow = _mm256_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
1406 // saturated sum for if overflow occurred (0x7FFFFFFFFFFFFFFF=max positive
1407 // long, when sign of a (and thus b as well) is 0, 0x8000000000000000=min
1408 // negative long, when sign of a (and thus b as well) is 1)
1409 __m256i saturatedSum = _mm256_xor_si256(
1410 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)),
1411 _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1412 // return saturated sum if overflow occurred, otherwise return sum
1413 return _mm256_or_si256(_mm256_andnot_si256(overflow, sum),
1414 _mm256_and_si256(overflow, saturatedSum));
1415}
1416
1417#else
1418
1419// non-avx2 workaround
1420template <typename T>
1421static SIMD_INLINE Vec<T, 32> adds(const Vec<T, 32> &a, const Vec<T, 32> &b)
1422{
1423 return Vec<T, 32>(adds(a.lo(), b.lo()), adds(a.hi(), b.hi()));
1424}
1425
1426#endif
1427
1428// Float not saturated
1429static SIMD_INLINE Vec<Float, 32> adds(const Vec<Float, 32> &a,
1430 const Vec<Float, 32> &b)
1431{
1432 return _mm256_add_ps(a, b);
1433}
1434
1435// Double not saturated
1436static SIMD_INLINE Vec<Double, 32> adds(const Vec<Double, 32> &a,
1437 const Vec<Double, 32> &b)
1438{
1439 return _mm256_add_pd(a, b);
1440}
1441
1442// ---------------------------------------------------------------------------
1443// sub
1444// ---------------------------------------------------------------------------
1445
1446#ifdef __AVX2__
1447
1448static SIMD_INLINE Vec<Byte, 32> sub(const Vec<Byte, 32> &a,
1449 const Vec<Byte, 32> &b)
1450{
1451 return _mm256_sub_epi8(a, b);
1452}
1453
1454static SIMD_INLINE Vec<SignedByte, 32> sub(const Vec<SignedByte, 32> &a,
1455 const Vec<SignedByte, 32> &b)
1456{
1457 return _mm256_sub_epi8(a, b);
1458}
1459
1460static SIMD_INLINE Vec<Word, 32> sub(const Vec<Word, 32> &a,
1461 const Vec<Word, 32> &b)
1462{
1463 return _mm256_sub_epi16(a, b);
1464}
1465
1466static SIMD_INLINE Vec<Short, 32> sub(const Vec<Short, 32> &a,
1467 const Vec<Short, 32> &b)
1468{
1469 return _mm256_sub_epi16(a, b);
1470}
1471
1472static SIMD_INLINE Vec<Int, 32> sub(const Vec<Int, 32> &a,
1473 const Vec<Int, 32> &b)
1474{
1475 return _mm256_sub_epi32(a, b);
1476}
1477
1478static SIMD_INLINE Vec<Long, 32> sub(const Vec<Long, 32> &a,
1479 const Vec<Long, 32> &b)
1480{
1481 return _mm256_sub_epi64(a, b);
1482}
1483
1484#else
1485
1486// non-avx2 workaround
1487template <typename T>
1488static SIMD_INLINE Vec<T, 32> sub(const Vec<T, 32> &a, const Vec<T, 32> &b)
1489{
1490 return Vec<T, 32>(sub(a.lo(), b.lo()), sub(a.hi(), b.hi()));
1491}
1492
1493#endif
1494
1495static SIMD_INLINE Vec<Float, 32> sub(const Vec<Float, 32> &a,
1496 const Vec<Float, 32> &b)
1497{
1498 return _mm256_sub_ps(a, b);
1499}
1500
1501static SIMD_INLINE Vec<Double, 32> sub(const Vec<Double, 32> &a,
1502 const Vec<Double, 32> &b)
1503{
1504 return _mm256_sub_pd(a, b);
1505}
1506
1507// ---------------------------------------------------------------------------
1508// subs
1509// ---------------------------------------------------------------------------
1510
1511#ifdef __AVX2__
1512
1513static SIMD_INLINE Vec<Byte, 32> subs(const Vec<Byte, 32> &a,
1514 const Vec<Byte, 32> &b)
1515{
1516 return _mm256_subs_epu8(a, b);
1517}
1518
1519static SIMD_INLINE Vec<SignedByte, 32> subs(const Vec<SignedByte, 32> &a,
1520 const Vec<SignedByte, 32> &b)
1521{
1522 return _mm256_subs_epi8(a, b);
1523}
1524
1525static SIMD_INLINE Vec<Word, 32> subs(const Vec<Word, 32> &a,
1526 const Vec<Word, 32> &b)
1527{
1528 return _mm256_subs_epu16(a, b);
1529}
1530
1531static SIMD_INLINE Vec<Short, 32> subs(const Vec<Short, 32> &a,
1532 const Vec<Short, 32> &b)
1533{
1534 return _mm256_subs_epi16(a, b);
1535}
1536
1537static SIMD_INLINE Vec<Int, 32> subs(const Vec<Int, 32> &a,
1538 const Vec<Int, 32> &b)
1539{
1540 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
1541 // saturated
1542
1543 // _mm256_subs_epi32 does not exist, workaround:
1544 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
1545 // value of x−y [...] occurs if and only if x and y have opposite signs and
1546 // the sign of x−y [...] is opposite to that of x [...]"
1547 __m256i diff = _mm256_sub_epi32(a, b);
1548 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1549 __m256i diffHasDiffSign = _mm256_xor_si256(a, diff);
1550 // indicates when an overflow has occurred
1551 __m256i overflow =
1552 _mm256_srai_epi32(_mm256_and_si256(opsHaveDiffSign, diffHasDiffSign), 31);
1553 // saturated diff for if overflow occurred (0x7FFFFFFF=max positive int, when
1554 // sign of a (and thus b as well) is 0, 0x80000000=min negative int, when sign
1555 // of a (and thus b as well) is 1)
1556 __m256i saturatedDiff =
1557 _mm256_xor_si256(_mm256_srai_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF));
1558 // return saturated diff if overflow occurred, otherwise return diff
1559 return _mm256_or_si256(_mm256_andnot_si256(overflow, diff),
1560 _mm256_and_si256(overflow, saturatedDiff));
1561}
1562
1563static SIMD_INLINE Vec<Long, 32> subs(const Vec<Long, 32> &a,
1564 const Vec<Long, 32> &b)
1565{
1566 // _mm256_subs_epi64 does not exist, workaround:
1567 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
1568 // value of x−y [...] occurs if and only if x and y have opposite signs and
1569 // the sign of x−y [...] is opposite to that of x [...]"
1570 __m256i diff = _mm256_sub_epi64(a, b);
1571 __m256i opsHaveDiffSign = _mm256_xor_si256(a, b);
1572 __m256i diffHasDiffSign = _mm256_xor_si256(a, diff);
1573 // indicates when an overflow has occurred
1574 __m256i overflow32 =
1575 _mm256_srai_epi32(_mm256_and_si256(opsHaveDiffSign, diffHasDiffSign), 31);
1576 // duplicate result to other half of 64 bit int
1577 __m256i overflow = _mm256_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
1578 // saturated diff for if overflow occurred (0x7FFFFFFFFFFFFFFF=max positive
1579 // long, when sign of a (and thus b as well) is 0, 0x8000000000000000=min
1580 // negative long, when sign of a (and thus b as well) is 1)
1581 __m256i saturatedDiff = _mm256_xor_si256(
1582 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)),
1583 _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1584 // return saturated diff if overflow occurred, otherwise return diff
1585 return _mm256_or_si256(_mm256_andnot_si256(overflow, diff),
1586 _mm256_and_si256(overflow, saturatedDiff));
1587}
1588
1589#else
1590
1591// non-avx2 workaround
1592template <typename T>
1593static SIMD_INLINE Vec<T, 32> subs(const Vec<T, 32> &a, const Vec<T, 32> &b)
1594{
1595 return Vec<T, 32>(subs(a.lo(), b.lo()), subs(a.hi(), b.hi()));
1596}
1597
1598#endif
1599
1600// Float not saturated
1601static SIMD_INLINE Vec<Float, 32> subs(const Vec<Float, 32> &a,
1602 const Vec<Float, 32> &b)
1603{
1604 return _mm256_sub_ps(a, b);
1605}
1606
1607// Double not saturated
1608static SIMD_INLINE Vec<Double, 32> subs(const Vec<Double, 32> &a,
1609 const Vec<Double, 32> &b)
1610{
1611 return _mm256_sub_pd(a, b);
1612}
1613
1614// ---------------------------------------------------------------------------
1615// neg (negate = two's complement or unary minus), only signed types
1616// ---------------------------------------------------------------------------
1617
1618#ifdef __AVX2__
1619
1620static SIMD_INLINE Vec<SignedByte, 32> neg(const Vec<SignedByte, 32> &a)
1621{
1622 return _mm256_sub_epi8(_mm256_setzero_si256(), a);
1623}
1624
1625static SIMD_INLINE Vec<Short, 32> neg(const Vec<Short, 32> &a)
1626{
1627 return _mm256_sub_epi16(_mm256_setzero_si256(), a);
1628}
1629
1630static SIMD_INLINE Vec<Int, 32> neg(const Vec<Int, 32> &a)
1631{
1632 return _mm256_sub_epi32(_mm256_setzero_si256(), a);
1633}
1634
1635static SIMD_INLINE Vec<Long, 32> neg(const Vec<Long, 32> &a)
1636{
1637 return _mm256_sub_epi64(_mm256_setzero_si256(), a);
1638}
1639
1640#else
1641
1642// non-avx2 workaround
1643template <typename T>
1644static SIMD_INLINE Vec<T, 32> neg(const Vec<T, 32> &a)
1645{
1646 return Vec<T, 32>(neg(a.lo()), neg(a.hi()));
1647}
1648
1649#endif
1650
1651static SIMD_INLINE Vec<Float, 32> neg(const Vec<Float, 32> &a)
1652{
1653 return _mm256_sub_ps(_mm256_setzero_ps(), a);
1654}
1655
1656static SIMD_INLINE Vec<Double, 32> neg(const Vec<Double, 32> &a)
1657{
1658 return _mm256_sub_pd(_mm256_setzero_pd(), a);
1659}
1660
1661// ---------------------------------------------------------------------------
1662// min
1663// ---------------------------------------------------------------------------
1664
1665#ifdef __AVX2__
1666
1667static SIMD_INLINE Vec<Byte, 32> min(const Vec<Byte, 32> &a,
1668 const Vec<Byte, 32> &b)
1669{
1670 return _mm256_min_epu8(a, b);
1671}
1672
1673static SIMD_INLINE Vec<SignedByte, 32> min(const Vec<SignedByte, 32> &a,
1674 const Vec<SignedByte, 32> &b)
1675{
1676 return _mm256_min_epi8(a, b);
1677}
1678
1679static SIMD_INLINE Vec<Word, 32> min(const Vec<Word, 32> &a,
1680 const Vec<Word, 32> &b)
1681{
1682 return _mm256_min_epu16(a, b);
1683}
1684
1685static SIMD_INLINE Vec<Short, 32> min(const Vec<Short, 32> &a,
1686 const Vec<Short, 32> &b)
1687{
1688 return _mm256_min_epi16(a, b);
1689}
1690
1691static SIMD_INLINE Vec<Int, 32> min(const Vec<Int, 32> &a,
1692 const Vec<Int, 32> &b)
1693{
1694 return _mm256_min_epi32(a, b);
1695}
1696
1697// there is an unsigned version of min for 32 bit but we currently
1698// don't have an element type for it
1699
1700static SIMD_INLINE Vec<Long, 32> min(const Vec<Long, 32> &a,
1701 const Vec<Long, 32> &b)
1702{
1703 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
1704 const __m256i diff = _mm256_sub_epi64(b, a);
1705#if 1 // TODO: check which is faster
1706 const __m256i res = _mm256_xor_si256(
1707 diff, _mm256_and_si256(_mm256_xor_si256(b, a), _mm256_xor_si256(diff, b)));
1708#else
1709 const __m256i res =
1710 _mm256_or_si256(_mm256_andnot_si256(a, b),
1711 _mm256_andnot_si256(_mm256_xor_si256(b, a), diff));
1712#endif
1713 // result in highest bit of res
1714 // spread highest bit to all bits
1715 const __m256i spread32 = _mm256_srai_epi32(res, 31);
1716 const __m256i gt = _mm256_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1717
1718 // blend a and b according to gt
1719 return _mm256_blendv_epi8(a, b, gt);
1720}
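
// Background for the 64-bit min above (and max below, which only swaps the
// blend operands): per the cited Hacker's Delight identity, the sign bit of
//   (x - y) ^ ((x ^ y) & ((x - y) ^ x))
// equals the signed comparison x < y even if x - y wraps around; with x = b
// and y = a this yields a > b, so the blend picks b wherever a > b.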
1721
1722#else
1723
1724// non-avx2 workaround
1725template <typename T>
1726static SIMD_INLINE Vec<T, 32> min(const Vec<T, 32> &a, const Vec<T, 32> &b)
1727{
1728 return Vec<T, 32>(min(a.lo(), b.lo()), min(a.hi(), b.hi()));
1729}
1730
1731#endif
1732
1733static SIMD_INLINE Vec<Float, 32> min(const Vec<Float, 32> &a,
1734 const Vec<Float, 32> &b)
1735{
1736 return _mm256_min_ps(a, b);
1737}
1738
1739static SIMD_INLINE Vec<Double, 32> min(const Vec<Double, 32> &a,
1740 const Vec<Double, 32> &b)
1741{
1742 return _mm256_min_pd(a, b);
1743}
1744
1745// ---------------------------------------------------------------------------
1746// max
1747// ---------------------------------------------------------------------------
1748
1749#ifdef __AVX2__
1750
1751static SIMD_INLINE Vec<Byte, 32> max(const Vec<Byte, 32> &a,
1752 const Vec<Byte, 32> &b)
1753{
1754 return _mm256_max_epu8(a, b);
1755}
1756
1757static SIMD_INLINE Vec<SignedByte, 32> max(const Vec<SignedByte, 32> &a,
1758 const Vec<SignedByte, 32> &b)
1759{
1760 return _mm256_max_epi8(a, b);
1761}
1762
1763static SIMD_INLINE Vec<Word, 32> max(const Vec<Word, 32> &a,
1764 const Vec<Word, 32> &b)
1765{
1766 return _mm256_max_epu16(a, b);
1767}
1768
1769static SIMD_INLINE Vec<Short, 32> max(const Vec<Short, 32> &a,
1770 const Vec<Short, 32> &b)
1771{
1772 return _mm256_max_epi16(a, b);
1773}
1774
1775static SIMD_INLINE Vec<Int, 32> max(const Vec<Int, 32> &a,
1776 const Vec<Int, 32> &b)
1777{
1778 return _mm256_max_epi32(a, b);
1779}
1780
1781// there is an unsigned version of max for 32 bit but we currently
1782// don't have an element type for it
1783
1784static SIMD_INLINE Vec<Long, 32> max(const Vec<Long, 32> &a,
1785 const Vec<Long, 32> &b)
1786{
1787 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
1788 const __m256i diff = _mm256_sub_epi64(b, a);
1789#if 1 // TODO: check which is faster
1790 const __m256i res = _mm256_xor_si256(
1791 diff, _mm256_and_si256(_mm256_xor_si256(b, a), _mm256_xor_si256(diff, b)));
1792#else
1793 const __m256i res =
1794 _mm256_or_si256(_mm256_andnot_si256(a, b),
1795 _mm256_andnot_si256(_mm256_xor_si256(b, a), diff));
1796#endif
1797 // result in highest bit of res
1798 // spread highest bit to all bits
1799 const __m256i spread32 = _mm256_srai_epi32(res, 31);
1800 const __m256i gt = _mm256_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1801
1802 // blend a and b according to gt
1803 return _mm256_blendv_epi8(b, a, gt);
1804}
1805
1806#else
1807
1808// non-avx2 workaround
1809template <typename T>
1810static SIMD_INLINE Vec<T, 32> max(const Vec<T, 32> &a, const Vec<T, 32> &b)
1811{
1812 return Vec<T, 32>(max(a.lo(), b.lo()), max(a.hi(), b.hi()));
1813}
1814
1815#endif
1816
1817static SIMD_INLINE Vec<Float, 32> max(const Vec<Float, 32> &a,
1818 const Vec<Float, 32> &b)
1819{
1820 return _mm256_max_ps(a, b);
1821}
1822
1823static SIMD_INLINE Vec<Double, 32> max(const Vec<Double, 32> &a,
1824 const Vec<Double, 32> &b)
1825{
1826 return _mm256_max_pd(a, b);
1827}
1828
1829// ---------------------------------------------------------------------------
1830// mul, div
1831// ---------------------------------------------------------------------------
1832
1833// TODO: add mul/div versions for int types? or make special versions of mul
1834// TODO: and div where the result is scaled?
1835
1836static SIMD_INLINE Vec<Float, 32> mul(const Vec<Float, 32> &a,
1837 const Vec<Float, 32> &b)
1838{
1839 return _mm256_mul_ps(a, b);
1840}
1841
1842static SIMD_INLINE Vec<Double, 32> mul(const Vec<Double, 32> &a,
1843 const Vec<Double, 32> &b)
1844{
1845 return _mm256_mul_pd(a, b);
1846}
1847
1848static SIMD_INLINE Vec<Float, 32> div(const Vec<Float, 32> &a,
1849 const Vec<Float, 32> &b)
1850{
1851 return _mm256_div_ps(a, b);
1852}
1853
1854static SIMD_INLINE Vec<Double, 32> div(const Vec<Double, 32> &a,
1855 const Vec<Double, 32> &b)
1856{
1857 return _mm256_div_pd(a, b);
1858}
1859
1860// ---------------------------------------------------------------------------
1861// ceil, floor, round, truncate
1862// ---------------------------------------------------------------------------
1863
1864// 25. Mar 23 (Jonas Keller): added versions for integer types
1865
1866// versions for integer types do nothing:
1867
1868template <typename T>
1869static SIMD_INLINE Vec<T, 32> ceil(const Vec<T, 32> &a)
1870{
1871 static_assert(std::is_integral<T>::value, "");
1872 return a;
1873}
1874
1875template <typename T>
1876static SIMD_INLINE Vec<T, 32> floor(const Vec<T, 32> &a)
1877{
1878 static_assert(std::is_integral<T>::value, "");
1879 return a;
1880}
1881
1882template <typename T>
1883static SIMD_INLINE Vec<T, 32> round(const Vec<T, 32> &a)
1884{
1885 static_assert(std::is_integral<T>::value, "");
1886 return a;
1887}
1888
1889template <typename T>
1890static SIMD_INLINE Vec<T, 32> truncate(const Vec<T, 32> &a)
1891{
1892 static_assert(std::is_integral<T>::value, "");
1893 return a;
1894}
1895
1896static SIMD_INLINE Vec<Float, 32> ceil(const Vec<Float, 32> &a)
1897{
1898 return _mm256_ceil_ps(a);
1899}
1900
1901static SIMD_INLINE Vec<Double, 32> ceil(const Vec<Double, 32> &a)
1902{
1903 return _mm256_ceil_pd(a);
1904}
1905
1906static SIMD_INLINE Vec<Float, 32> floor(const Vec<Float, 32> &a)
1907{
1908 return _mm256_floor_ps(a);
1909}
1910
1911static SIMD_INLINE Vec<Double, 32> floor(const Vec<Double, 32> &a)
1912{
1913 return _mm256_floor_pd(a);
1914}
1915
1916static SIMD_INLINE Vec<Float, 32> round(const Vec<Float, 32> &a)
1917{
1918 // old: use _MM_SET_ROUNDING_MODE to adjust rounding direction
1919 // return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1920 // new 4. Aug 16 (rm): round to nearest, and suppress exceptions
1921 return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1922}
1923
1924static SIMD_INLINE Vec<Double, 32> round(const Vec<Double, 32> &a)
1925{
1926 return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1927}
1928
1929static SIMD_INLINE Vec<Float, 32> truncate(const Vec<Float, 32> &a)
1930{
1931 return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1932}
1933
1934static SIMD_INLINE Vec<Double, 32> truncate(const Vec<Double, 32> &a)
1935{
1936 return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1937}
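
// Illustrative examples of the rounding variants above (values chosen only
// for exposition): with round-to-nearest-even, round(2.5) == 2.0 and
// round(3.5) == 4.0, whereas truncate(-1.7) == -1.0, floor(-1.7) == -2.0 and
// ceil(-1.7) == -1.0.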
1938
1939// ---------------------------------------------------------------------------
1940// elementary mathematical functions
1941// ---------------------------------------------------------------------------
1942
1943// estimate of a reciprocal
1944static SIMD_INLINE Vec<Float, 32> rcp(const Vec<Float, 32> &a)
1945{
1946 return _mm256_rcp_ps(a);
1947}
1948
1949static SIMD_INLINE Vec<Double, 32> rcp(const Vec<Double, 32> &a)
1950{
1951 // _mm256_rcp_pd does not exist
1952 return Vec<Double, 32>(rcp(a.lo()), rcp(a.hi()));
1953}
1954
1955 // estimate of the reciprocal square root
1956static SIMD_INLINE Vec<Float, 32> rsqrt(const Vec<Float, 32> &a)
1957{
1958 return _mm256_rsqrt_ps(a);
1959}
1960
1961static SIMD_INLINE Vec<Double, 32> rsqrt(const Vec<Double, 32> &a)
1962{
1963 // _mm256_rsqrt_pd does not exist
1964 return Vec<Double, 32>(rsqrt(a.lo()), rsqrt(a.hi()));
1965}
1966
1967// square root
1968static SIMD_INLINE Vec<Float, 32> sqrt(const Vec<Float, 32> &a)
1969{
1970 return _mm256_sqrt_ps(a);
1971}
1972
1973static SIMD_INLINE Vec<Double, 32> sqrt(const Vec<Double, 32> &a)
1974{
1975 return _mm256_sqrt_pd(a);
1976}
1977
1978// ---------------------------------------------------------------------------
1979// abs
1980// ---------------------------------------------------------------------------
1981
1982// 25. Mar 25 (Jonas Keller): added abs for unsigned integers
1983
1984// unsigned integers
1985template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
1986 &&std::is_integral<T>::value)>
1987static SIMD_INLINE Vec<T, 32> abs(const Vec<T, 32> &a)
1988{
1989 return a;
1990}
1991
1992static SIMD_INLINE Vec<SignedByte, 32> abs(const Vec<SignedByte, 32> &a)
1993{
1994#ifdef __AVX2__
1995 return _mm256_abs_epi8(a);
1996#else
1997 // non-avx2 workaround
1998 return Vec<SignedByte, 32>(abs(a.lo()), abs(a.hi()));
1999#endif
2000}
2001
2002static SIMD_INLINE Vec<Short, 32> abs(const Vec<Short, 32> &a)
2003{
2004#ifdef __AVX2__
2005 return _mm256_abs_epi16(a);
2006#else
2007 // non-avx2 workaround
2008 return Vec<Short, 32>(abs(a.lo()), abs(a.hi()));
2009#endif
2010}
2011
2012static SIMD_INLINE Vec<Int, 32> abs(const Vec<Int, 32> &a)
2013{
2014#ifdef __AVX2__
2015 return _mm256_abs_epi32(a);
2016#else
2017 // non-avx2 workaround
2018 return Vec<Int, 32>(abs(a.lo()), abs(a.hi()));
2019#endif
2020}
2021
2022static SIMD_INLINE Vec<Long, 32> abs(const Vec<Long, 32> &a)
2023{
2024#ifdef __AVX2__
2025 // _mm256_abs_epi64 is only supported in avx512
2026 // from Hacker's Delight, 2-4 Absolute Value Function:
2027 const __m256i signMask =
2028 _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
2029 return _mm256_sub_epi64(_mm256_xor_si256(a, signMask), signMask);
2030#else
2031 // non-avx2 workaround
2032 return Vec<Long, 32>(abs(a.lo()), abs(a.hi()));
2033#endif
2034}
2035
2036static SIMD_INLINE Vec<Float, 32> abs(const Vec<Float, 32> &a)
2037{
2038 // there's no _mm256_abs_ps, we have to emulate it:
2039 // -0.0F is 0x80000000; andnot masks with 0x7fffffff, so the sign bit is cleared
2040 return _mm256_andnot_ps(_mm256_set1_ps(-0.0F), a);
2041}
2042
2043static SIMD_INLINE Vec<Double, 32> abs(const Vec<Double, 32> &a)
2044{
2045 // there's no _mm256_abs_pd, we have to emulate it:
2046 // -0.0 is 0x8000000000000000; andnot masks with 0x7fffffffffffffff, so the
2047 // sign bit is cleared
2048 return _mm256_andnot_pd(_mm256_set1_pd(-0.0), a);
2049}
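
// Illustrative example of the sign-masking trick above: since the bit
// pattern of -0.0 consists of the sign bit only, andnot clears exactly that
// bit, e.g. abs(-3.5) == 3.5 and abs(-0.0) == +0.0.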
2050
2051// ---------------------------------------------------------------------------
2052// unpacklo
2053// ---------------------------------------------------------------------------
2054
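// Note on the implementations below (illustrative): the per-lane AVX2 unpack
// instructions only interleave within each 128-bit lane; x_mm256_transpose4x64_*
// first rearranges the four 64-bit blocks of each input so that the
// subsequent lane-local unpacklo/unpackhi produce a full-width,
// non-lane-oriented unpack across all 32 bytes.
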
2055// all integer versions
2056template <typename T>
2057static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2058 Part<0>, Bytes<1>)
2059{
2060 return x_mm256_unpacklo_epi8(x_mm256_transpose4x64_epi64(a),
2061 x_mm256_transpose4x64_epi64(b));
2062}
2063
2064// all integer versions
2065template <typename T>
2066static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2067 Part<0>, Bytes<2>)
2068{
2069 return x_mm256_unpacklo_epi16(x_mm256_transpose4x64_epi64(a),
2070 x_mm256_transpose4x64_epi64(b));
2071}
2072
2073// all integer versions
2074template <typename T>
2075static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2076 Part<0>, Bytes<4>)
2077{
2078 return x_mm256_unpacklo_epi32(x_mm256_transpose4x64_epi64(a),
2079 x_mm256_transpose4x64_epi64(b));
2080}
2081
2082// all integer versions
2083template <typename T>
2084static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2085 Part<0>, Bytes<8>)
2086{
2087 return x_mm256_unpacklo_epi64(x_mm256_transpose4x64_epi64(a),
2088 x_mm256_transpose4x64_epi64(b));
2089}
2090
2091// all integer versions
2092template <typename T>
2093static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2094 Part<0>, Bytes<16>)
2095{
2096 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2097}
2098
2099// float version
2100static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2101 const Vec<Float, 32> &b, Part<0>,
2102 Bytes<4>)
2103{
2104 return _mm256_unpacklo_ps(x_mm256_transpose4x64_ps(a),
2105 x_mm256_transpose4x64_ps(b));
2106}
2107
2108 // float version
2109static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2110 const Vec<Float, 32> &b, Part<0>,
2111 Bytes<8>)
2112{
2113 return x_mm256_unpacklo_2ps(x_mm256_transpose4x64_ps(a),
2114 x_mm256_transpose4x64_ps(b));
2115}
2116
2117// float version
2118static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2119 const Vec<Float, 32> &b, Part<0>,
2120 Bytes<16>)
2121{
2122 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2123}
2124
2125// double version
2126static SIMD_INLINE Vec<Double, 32> unpack(const Vec<Double, 32> &a,
2127 const Vec<Double, 32> &b, Part<0>,
2128 Bytes<8>)
2129{
2130 return _mm256_unpacklo_pd(x_mm256_transpose4x64_pd(a),
2131 x_mm256_transpose4x64_pd(b));
2132}
2133
2134// double version
2135static SIMD_INLINE Vec<Double, 32> unpack(const Vec<Double, 32> &a,
2136 const Vec<Double, 32> &b, Part<0>,
2137 Bytes<16>)
2138{
2139 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2140}
2141
2142// ---------------------------------------------------------------------------
2143// unpackhi
2144// ---------------------------------------------------------------------------
2145
2146// all integer versions
2147template <typename T>
2148static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2149 Part<1>, Bytes<1>)
2150{
2151 return x_mm256_unpackhi_epi8(x_mm256_transpose4x64_epi64(a),
2152 x_mm256_transpose4x64_epi64(b));
2153}
2154
2155// all integer versions
2156template <typename T>
2157static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2158 Part<1>, Bytes<2>)
2159{
2160 return x_mm256_unpackhi_epi16(x_mm256_transpose4x64_epi64(a),
2161 x_mm256_transpose4x64_epi64(b));
2162}
2163
2164// all integer versions
2165template <typename T>
2166static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2167 Part<1>, Bytes<4>)
2168{
2169 return x_mm256_unpackhi_epi32(x_mm256_transpose4x64_epi64(a),
2170 x_mm256_transpose4x64_epi64(b));
2171}
2172
2173// all integer versions
2174template <typename T>
2175static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2176 Part<1>, Bytes<8>)
2177{
2178 return x_mm256_unpackhi_epi64(x_mm256_transpose4x64_epi64(a),
2179 x_mm256_transpose4x64_epi64(b));
2180}
2181
2182// all integer versions
2183template <typename T>
2184static SIMD_INLINE Vec<T, 32> unpack(const Vec<T, 32> &a, const Vec<T, 32> &b,
2185 Part<1>, Bytes<16>)
2186{
2187 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2188}
2189
2190// float version
2191static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2192 const Vec<Float, 32> &b, Part<1>,
2193 Bytes<4>)
2194{
2195 return _mm256_unpackhi_ps(x_mm256_transpose4x64_ps(a),
2196 x_mm256_transpose4x64_ps(b));
2197}
2198
2199// float version
2200static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2201 const Vec<Float, 32> &b, Part<1>,
2202 Bytes<8>)
2203{
2204 return x_mm256_unpackhi_2ps(x_mm256_transpose4x64_ps(a),
2205 x_mm256_transpose4x64_ps(b));
2206}
2207
2208// float version
2209static SIMD_INLINE Vec<Float, 32> unpack(const Vec<Float, 32> &a,
2210 const Vec<Float, 32> &b, Part<1>,
2211 Bytes<16>)
2212{
2213 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2214}
2215
2216// double version
2217static SIMD_INLINE Vec<Double, 32> unpack(const Vec<Double, 32> &a,
2218 const Vec<Double, 32> &b, Part<1>,
2219 Bytes<8>)
2220{
2221 return _mm256_unpackhi_pd(x_mm256_transpose4x64_pd(a),
2222 x_mm256_transpose4x64_pd(b));
2223}
2224
2225// double version
2226static SIMD_INLINE Vec<Double, 32> unpack(const Vec<Double, 32> &a,
2227 const Vec<Double, 32> &b, Part<1>,
2228 Bytes<16>)
2229{
2230 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2231}
2232
2233// ---------------------------------------------------------------------------
2234// 16-byte-lane oriented unpacklo
2235// ---------------------------------------------------------------------------
2236
2237// contributed by Adam Marschall
2238
2239// all integer versions
2240template <typename T>
2241static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2242 Part<0>, Bytes<1>)
2243{
2244 return x_mm256_unpacklo_epi8(a, b);
2245}
2246
2247// all integer versions
2248template <typename T>
2249static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2250 Part<0>, Bytes<2>)
2251{
2252 return x_mm256_unpacklo_epi16(a, b);
2253}
2254
2255// all integer versions
2256template <typename T>
2257static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2258 Part<0>, Bytes<4>)
2259{
2260 return x_mm256_unpacklo_epi32(a, b);
2261}
2262
2263// all integer versions
2264template <typename T>
2265static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2266 Part<0>, Bytes<8>)
2267{
2268 return x_mm256_unpacklo_epi64(a, b);
2269}
2270
2271// all integer versions
2272template <typename T>
2273static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2274 Part<0>, Bytes<16>)
2275{
2276 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2277}
2278
2279// float version
2280static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2281 const Vec<Float, 32> &b, Part<0>,
2282 Bytes<4>)
2283{
2284 return _mm256_unpacklo_ps(a, b);
2285}
2286
2287 // float version
2288static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2289 const Vec<Float, 32> &b, Part<0>,
2290 Bytes<8>)
2291{
2292 return x_mm256_unpacklo_2ps(a, b);
2293}
2294
2295// float version
2296static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2297 const Vec<Float, 32> &b, Part<0>,
2298 Bytes<16>)
2299{
2300 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2301}
2302
2303// double version
2304static SIMD_INLINE Vec<Double, 32> unpack16(const Vec<Double, 32> &a,
2305 const Vec<Double, 32> &b, Part<0>,
2306 Bytes<8>)
2307{
2308 return _mm256_unpacklo_pd(a, b);
2309}
2310
2311// double version
2312static SIMD_INLINE Vec<Double, 32> unpack16(const Vec<Double, 32> &a,
2313 const Vec<Double, 32> &b, Part<0>,
2314 Bytes<16>)
2315{
2316 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2317}
2318
2319// ---------------------------------------------------------------------------
2320 // 16-byte-lane oriented unpackhi
2321// ---------------------------------------------------------------------------
2322
2323// all integer versions
2324template <typename T>
2325static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2326 Part<1>, Bytes<1>)
2327{
2328 return x_mm256_unpackhi_epi8(a, b);
2329}
2330
2331// all integer versions
2332template <typename T>
2333static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2334 Part<1>, Bytes<2>)
2335{
2336 return x_mm256_unpackhi_epi16(a, b);
2337}
2338
2339// all integer versions
2340template <typename T>
2341static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2342 Part<1>, Bytes<4>)
2343{
2344 return x_mm256_unpackhi_epi32(a, b);
2345}
2346
2347// all integer versions
2348template <typename T>
2349static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2350 Part<1>, Bytes<8>)
2351{
2352 return x_mm256_unpackhi_epi64(a, b);
2353}
2354
2355// all integer versions
2356template <typename T>
2357static SIMD_INLINE Vec<T, 32> unpack16(const Vec<T, 32> &a, const Vec<T, 32> &b,
2358 Part<1>, Bytes<16>)
2359{
2360 return _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2361}
2362
2363// float version
2364static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2365 const Vec<Float, 32> &b, Part<1>,
2366 Bytes<4>)
2367{
2368 return _mm256_unpackhi_ps(a, b);
2369}
2370
2371// float version
2372static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2373 const Vec<Float, 32> &b, Part<1>,
2374 Bytes<8>)
2375{
2376 return x_mm256_unpackhi_2ps(a, b);
2377}
2378
2379// float version
2380static SIMD_INLINE Vec<Float, 32> unpack16(const Vec<Float, 32> &a,
2381 const Vec<Float, 32> &b, Part<1>,
2382 Bytes<16>)
2383{
2384 return _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2385}
2386
2387// double version
2388static SIMD_INLINE Vec<Double, 32> unpack16(const Vec<Double, 32> &a,
2389 const Vec<Double, 32> &b, Part<1>,
2390 Bytes<8>)
2391{
2392 return _mm256_unpackhi_pd(a, b);
2393}
2394
2395// double version
2396static SIMD_INLINE Vec<Double, 32> unpack16(const Vec<Double, 32> &a,
2397 const Vec<Double, 32> &b, Part<1>,
2398 Bytes<16>)
2399{
2400 return _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2401}
2402
2403// ---------------------------------------------------------------------------
2404// extract 128-bit-lane as Vec<T, 16>
2405// ---------------------------------------------------------------------------
2406
2407// contributed by Adam Marschall
2408
2409// generalized extract of 128-bit-lanes
2410// LANE_INDEX=0: first lane of input vector,
2411// LANE_INDEX=1: second lane of input vector
2412template <size_t LANE_INDEX, typename T>
2413static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 32> &a)
2414{
2415 const auto intA = reinterpret(a, OutputType<Int>());
2416 const Vec<Int, 16> intRes = _mm256_extractf128_si256(intA, LANE_INDEX);
2417 return reinterpret(intRes, OutputType<T>());
2418}
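
// Usage sketch (illustrative only, the variable names are made up):
//   Vec<Float, 32> v = ...;
//   Vec<Float, 16> lower = extractLane<0>(v); // elements 0..3
//   Vec<Float, 16> upper = extractLane<1>(v); // elements 4..7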
2419
2420// ---------------------------------------------------------------------------
2421// zip
2422// ---------------------------------------------------------------------------
2423
2424// a, b are passed by-value to avoid problems with identical
2425// input/output args.
2426
2427// here we typically have to transpose the inputs in the same way
2428// for both output computations, so we define separate functions for
2429// all T and Bytes<> (combinations of unpack functions above)
2430
2431// all integer versions
2432template <typename T>
2433static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2434 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<1>)
2435{
2436 __m256i at = x_mm256_transpose4x64_epi64(a);
2437 __m256i bt = x_mm256_transpose4x64_epi64(b);
2438 l = x_mm256_unpacklo_epi8(at, bt);
2439 h = x_mm256_unpackhi_epi8(at, bt);
2440}
2441
2442// all integer versions
2443template <typename T>
2444static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2445 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<2>)
2446{
2447 __m256i at = x_mm256_transpose4x64_epi64(a);
2448 __m256i bt = x_mm256_transpose4x64_epi64(b);
2449 l = x_mm256_unpacklo_epi16(at, bt);
2450 h = x_mm256_unpackhi_epi16(at, bt);
2451}
2452
2453// all integer versions
2454template <typename T>
2455static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2456 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<4>)
2457{
2458 __m256i at = x_mm256_transpose4x64_epi64(a);
2459 __m256i bt = x_mm256_transpose4x64_epi64(b);
2460 l = x_mm256_unpacklo_epi32(at, bt);
2461 h = x_mm256_unpackhi_epi32(at, bt);
2462}
2463
2464// all integer versions
2465template <typename T>
2466static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2467 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<8>)
2468{
2469 __m256i at = x_mm256_transpose4x64_epi64(a);
2470 __m256i bt = x_mm256_transpose4x64_epi64(b);
2471 l = x_mm256_unpacklo_epi64(at, bt);
2472 h = x_mm256_unpackhi_epi64(at, bt);
2473}
2474
2475// all integer versions
2476template <typename T>
2477static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2478 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<16>)
2479{
2480 l = _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2481 h = _mm256_permute2f128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2482}
2483
2484// float version
2485static SIMD_INLINE void zip(const Vec<Float, 32> a, const Vec<Float, 32> b,
2486 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<4>)
2487{
2488 __m256 at = x_mm256_transpose4x64_ps(a);
2489 __m256 bt = x_mm256_transpose4x64_ps(b);
2490 l = _mm256_unpacklo_ps(at, bt);
2491 h = _mm256_unpackhi_ps(at, bt);
2492}
2493
2494// float version
2495static SIMD_INLINE void zip(const Vec<Float, 32> a, const Vec<Float, 32> b,
2496 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<8>)
2497{
2498 __m256 at = x_mm256_transpose4x64_ps(a);
2499 __m256 bt = x_mm256_transpose4x64_ps(b);
2500 l = x_mm256_unpacklo_2ps(at, bt);
2501 h = x_mm256_unpackhi_2ps(at, bt);
2502}
2503
2504// float version
2505static SIMD_INLINE void zip(const Vec<Float, 32> a, const Vec<Float, 32> b,
2506 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<16>)
2507{
2508 l = _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2509 h = _mm256_permute2f128_ps(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2510}
2511
2512// double version
2513static SIMD_INLINE void zip(const Vec<Double, 32> a, const Vec<Double, 32> b,
2514 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<8>)
2515{
2516 __m256d at = x_mm256_transpose4x64_pd(a);
2517 __m256d bt = x_mm256_transpose4x64_pd(b);
2518 l = _mm256_unpacklo_pd(at, bt);
2519 h = _mm256_unpackhi_pd(at, bt);
2520}
2521
2522// double version
2523static SIMD_INLINE void zip(const Vec<Double, 32> a, const Vec<Double, 32> b,
2524 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<16>)
2525{
2526 l = _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 2, 0, 0));
2527 h = _mm256_permute2f128_pd(a, b, _MM_SHUFFLE(0, 3, 0, 1));
2528}
2529
2530// ---------------------------------------------------------------------------
2531// zip hub
2532// ---------------------------------------------------------------------------
2533
2534// zips blocks of NUM_ELEMS elements of type T
2535template <size_t NUM_ELEMS, typename T>
2536static SIMD_INLINE void zip(const Vec<T, 32> a, const Vec<T, 32> b,
2537 Vec<T, 32> &l, Vec<T, 32> &h)
2538{
2539 return zip(a, b, l, h, Bytes<NUM_ELEMS * sizeof(T)>());
2540}
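
// Illustrative example of the zip semantics (not part of the code): for
// T = Byte and NUM_ELEMS = 1, zipping a = (a0, a1, ..., a31) and
// b = (b0, b1, ..., b31) yields l = (a0, b0, a1, b1, ..., a15, b15) and
// h = (a16, b16, ..., a31, b31); for NUM_ELEMS > 1, whole blocks of
// NUM_ELEMS elements are interleaved instead of single elements.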
2541
2542// ---------------------------------------------------------------------------
2543// zip16 hub (16-byte-lane oriented zip)
2544// ---------------------------------------------------------------------------
2545
2546// contributed by Adam Marschall
2547
2548// zips blocks of NUM_ELEMS elements of type T
2549template <size_t NUM_ELEMS, typename T>
2550static SIMD_INLINE void zip16(const Vec<T, 32> a, const Vec<T, 32> b,
2551 Vec<T, 32> &l, Vec<T, 32> &h)
2552{
2553 l = unpack16(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
2554 h = unpack16(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
2555}
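
// Illustrative note: unlike zip above, zip16 interleaves within each 128-bit
// lane separately (the inputs are not transposed first), which saves the
// lane-crossing permutes when a lane-oriented result order is acceptable.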
2556
2557// ---------------------------------------------------------------------------
2558// unzip
2559// ---------------------------------------------------------------------------
2560
2561// a, b are passed by-value to avoid problems with identical input/output args.
2562
2563// here we typically have to transpose the inputs in the same way
2564// for both output computations, so we define separate functions for
2565// all T and Bytes<> (combinations of unpack functions above)
2566
2567// all integer versions
2568template <typename T>
2569static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2570 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<1>)
2571{
2572 // mask is hopefully only set once if unzip is used multiple times
2573 const __m256i mask =
2574 _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15,
2575 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
2576 const __m256i atmp =
2577 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(a, mask));
2578 const __m256i btmp =
2579 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(b, mask));
2580 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2581 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2582}
2583
2584// all integer versions
2585template <typename T>
2586static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2587 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<2>)
2588{
2589 // mask is hopefully only set once if unzip is used multiple times
2590 const __m256i mask =
2591 _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15,
2592 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
2593 const __m256i atmp =
2594 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(a, mask));
2595 const __m256i btmp =
2596 x_mm256_transpose4x64_epi64(x_mm256_shuffle_epi8(b, mask));
2597 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2598 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2599}
2600
2601// all integer versions
2602template <typename T>
2603static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2604 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<4>)
2605{
2606#ifdef __AVX2__
2607 const __m256i aShuffled = _mm256_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2608 const __m256i bShuffled = _mm256_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2609#else
2610 const __m256i aShuffled = _mm256_castps_si256(_mm256_shuffle_ps(
2611 _mm256_castsi256_ps(a), _mm256_castsi256_ps(a), _MM_SHUFFLE(3, 1, 2, 0)));
2612 const __m256i bShuffled = _mm256_castps_si256(_mm256_shuffle_ps(
2613 _mm256_castsi256_ps(b), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 2, 0)));
2614#endif
2615 const __m256i atmp = x_mm256_transpose4x64_epi64(aShuffled);
2616 const __m256i btmp = x_mm256_transpose4x64_epi64(bShuffled);
2617 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2618 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2619}
2620
2621// all integer versions
2622template <typename T>
2623static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2624 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<8>)
2625{
2626 const __m256i atmp = x_mm256_transpose4x64_epi64(a);
2627 const __m256i btmp = x_mm256_transpose4x64_epi64(b);
2628 l = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2629 h = _mm256_permute2f128_si256(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2630}
2631
2632// all types
2633template <typename T>
2634static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2635 Vec<T, 32> &l, Vec<T, 32> &h, Bytes<16>)
2636{
2637 l = unpack(a, b, Part<0>(), Bytes<16>());
2638 h = unpack(a, b, Part<1>(), Bytes<16>());
2639}
2640
2641// float version
2642static SIMD_INLINE void unzip(const Vec<Float, 32> a, const Vec<Float, 32> b,
2643 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<4>)
2644{
2645 const __m256 atmp =
2646 x_mm256_transpose4x64_ps(_mm256_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 0)));
2647 const __m256 btmp =
2648 x_mm256_transpose4x64_ps(_mm256_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 2, 0)));
2649 l = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2650 h = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2651}
2652
2653// float version
2654static SIMD_INLINE void unzip(const Vec<Float, 32> a, const Vec<Float, 32> b,
2655 Vec<Float, 32> &l, Vec<Float, 32> &h, Bytes<8>)
2656{
2657 const __m256 atmp = x_mm256_transpose4x64_ps(a);
2658 const __m256 btmp = x_mm256_transpose4x64_ps(b);
2659 l = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2660 h = _mm256_permute2f128_ps(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2661}
2662
2663// double version
2664static SIMD_INLINE void unzip(const Vec<Double, 32> a, const Vec<Double, 32> b,
2665 Vec<Double, 32> &l, Vec<Double, 32> &h, Bytes<8>)
2666{
2667 const __m256d atmp = x_mm256_transpose4x64_pd(a);
2668 const __m256d btmp = x_mm256_transpose4x64_pd(b);
2669 l = _mm256_permute2f128_pd(atmp, btmp, _MM_SHUFFLE(0, 2, 0, 0));
2670 h = _mm256_permute2f128_pd(atmp, btmp, _MM_SHUFFLE(0, 3, 0, 1));
2671}
2672
2673// ---------------------------------------------------------------------------
2674// unzip hub
2675// ---------------------------------------------------------------------------
2676
2677// hub
2678template <size_t NUM_ELEMS, typename T>
2679static SIMD_INLINE void unzip(const Vec<T, 32> a, const Vec<T, 32> b,
2680 Vec<T, 32> &l, Vec<T, 32> &h)
2681{
2682 return unzip(a, b, l, h, Bytes<NUM_ELEMS * sizeof(T)>());
2683}
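
// Illustrative example of the unzip semantics: for T = Byte and
// NUM_ELEMS = 1, unzipping the concatenated sequence (a, b) = (x0, x1, ...,
// x63) collects the even-indexed elements in l and the odd-indexed elements
// in h; for NUM_ELEMS > 1 the same holds for blocks of NUM_ELEMS elements,
// so unzip reverses the effect of zip above.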
2684
2685// ---------------------------------------------------------------------------
2686// packs
2687// ---------------------------------------------------------------------------
2688
2689// ========== signed -> signed ==========
2690
2691static SIMD_INLINE Vec<SignedByte, 32> packs(const Vec<Short, 32> &a,
2692 const Vec<Short, 32> &b,
2693 OutputType<SignedByte>)
2694{
2695 return x_mm256_transpose4x64_epi64(x_mm256_packs_epi16(a, b));
2696}
2697
2698static SIMD_INLINE Vec<Short, 32> packs(const Vec<Int, 32> &a,
2699 const Vec<Int, 32> &b,
2700 OutputType<Short>)
2701{
2702 return x_mm256_transpose4x64_epi64(x_mm256_packs_epi32(a, b));
2703}
2704
2705static SIMD_INLINE Vec<Short, 32> packs(const Vec<Float, 32> &a,
2706 const Vec<Float, 32> &b,
2707 OutputType<Short>)
2708{
2709 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2710 OutputType<Short>());
2711}
2712
2713static SIMD_INLINE Vec<Float, 32> packs(const Vec<Long, 32> &a,
2714 const Vec<Long, 32> &b,
2715 OutputType<Float>)
2716{
2717 // _mm256_cvtepi64_ps is not available in avx
2718 return _mm256_set_m128(_mm256_cvtpd_ps(cvts(b, OutputType<Double>())),
2719 _mm256_cvtpd_ps(cvts(a, OutputType<Double>())));
2720}
2721
2722static SIMD_INLINE Vec<Int, 32> packs(const Vec<Long, 32> &a,
2723 const Vec<Long, 32> &b, OutputType<Int>)
2724{
2725 // _mm256_packs_epi64 is not available in avx
2726
2727#ifdef __AVX2__
2728 const auto maxClip = _mm256_set1_epi64x(0x000000007fffffff);
2729 const auto minClip = _mm256_set1_epi64x(0xffffffff80000000);
2730 const auto aSaturatedMin =
2731 _mm256_blendv_epi8(a, minClip, _mm256_cmpgt_epi64(minClip, a));
2732 const auto aSaturated =
2733 _mm256_blendv_epi8(aSaturatedMin, maxClip, _mm256_cmpgt_epi64(a, maxClip));
2734 const auto bSaturatedMin =
2735 _mm256_blendv_epi8(b, minClip, _mm256_cmpgt_epi64(minClip, b));
2736 const auto bSaturated =
2737 _mm256_blendv_epi8(bSaturatedMin, maxClip, _mm256_cmpgt_epi64(b, maxClip));
2738 return x_mm256_transpose4x64_epi64(_mm256_castps_si256(_mm256_shuffle_ps(
2739 _mm256_castsi256_ps(aSaturated), _mm256_castsi256_ps(bSaturated),
2740 _MM_SHUFFLE(2, 0, 2, 0))));
2741#else
2742
2743 // a vectorized workaround for when AVX2 is not available seems to be
2744 // complicated, so we just use a serial workaround
2745 // TODO: is there a better, vectorized workaround?
2746
2747 Long input[8] SIMD_ATTR_ALIGNED(32);
2748 _mm256_store_si256((__m256i *) input, a);
2749 _mm256_store_si256((__m256i *) (input + 4), b);
2750 Int output[8] SIMD_ATTR_ALIGNED(32);
2751 for (int i = 0; i < 8; ++i) {
2752 output[i] =
2753 (Int) std::min(std::max(input[i], (Long) std::numeric_limits<Int>::min()),
2754 (Long) std::numeric_limits<Int>::max());
2755 }
2756 return _mm256_load_si256((__m256i *) output);
2757#endif
2758}
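
// Illustrative note on the AVX2 path above: each 64-bit input is first
// clamped to [INT32_MIN, INT32_MAX] by the two compare/blend steps, the
// _mm256_shuffle_ps with _MM_SHUFFLE(2, 0, 2, 0) then keeps only the low
// 32-bit half of every 64-bit element, and the final transpose restores the
// element order across the two 128-bit lanes.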
2759
2760static SIMD_INLINE Vec<Float, 32> packs(const Vec<Double, 32> &a,
2761 const Vec<Double, 32> &b,
2762 OutputType<Float>)
2763{
2764 return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
2765}
2766
2767static SIMD_INLINE Vec<Int, 32> packs(const Vec<Double, 32> &a,
2768 const Vec<Double, 32> &b, OutputType<Int>)
2769{
2770 const __m256d clip = _mm256_set1_pd(std::numeric_limits<Int>::max());
2771 return _mm256_set_m128i(_mm256_cvtpd_epi32(_mm256_min_pd(clip, b)),
2772 _mm256_cvtpd_epi32(_mm256_min_pd(clip, a)));
2773}
2774
2775// ========== unsigned -> unsigned ==========
2776
2777static SIMD_INLINE Vec<Byte, 32> packs(const Vec<Word, 32> &a,
2778 const Vec<Word, 32> &b, OutputType<Byte>)
2779{
2780#ifdef __AVX2__
2781 // _mm256_packus_epu16 does not exist, so saturate inputs to byte range and
2782 // then use _mm256_packus_epi16
2783 return x_mm256_transpose4x64_epi64(
2784 _mm256_packus_epi16(_mm256_min_epu16(a, _mm256_set1_epi16(0xff)),
2785 _mm256_min_epu16(b, _mm256_set1_epi16(0xff))));
2786#else
2787 return x_mm256_transpose4x64_epi64(
2788 Vec<Byte, 32>(packs(a.lo(), b.lo(), OutputType<Byte>()),
2789 packs(a.hi(), b.hi(), OutputType<Byte>())));
2790#endif
2791}
2792
2793// ========== signed -> unsigned ==========
2794
2795// non-avx2 workaround
2796static SIMD_INLINE Vec<Byte, 32> packs(const Vec<Short, 32> &a,
2797 const Vec<Short, 32> &b,
2798 OutputType<Byte>)
2799{
2800 return x_mm256_transpose4x64_epi64(x_mm256_packus_epi16(a, b));
2801}
2802
2803// non-avx2 workaround
2804static SIMD_INLINE Vec<Word, 32> packs(const Vec<Int, 32> &a,
2805 const Vec<Int, 32> &b, OutputType<Word>)
2806{
2807 return x_mm256_transpose4x64_epi64(x_mm256_packus_epi32(a, b));
2808}
2809
2810static SIMD_INLINE Vec<Word, 32> packs(const Vec<Float, 32> &a,
2811 const Vec<Float, 32> &b,
2812 OutputType<Word>)
2813{
2814 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2815 OutputType<Word>());
2816}
2817
2818// ========== unsigned -> signed ==========
2819static SIMD_INLINE Vec<SignedByte, 32> packs(const Vec<Word, 32> &a,
2820 const Vec<Word, 32> &b,
2821 OutputType<SignedByte>)
2822{
2823#ifdef __AVX2__
2824 // _mm256_packs_epu16 does not exist, so saturate inputs to signed byte range
2825 // and then use _mm256_packs_epi16
2826 return x_mm256_transpose4x64_epi64(
2827 _mm256_packs_epi16(_mm256_min_epu16(a, _mm256_set1_epi16(0x7f)),
2828 _mm256_min_epu16(b, _mm256_set1_epi16(0x7f))));
2829#else
2830 return x_mm256_transpose4x64_epi64(
2831 Vec<SignedByte, 32>(packs(a.lo(), b.lo(), OutputType<SignedByte>()),
2832 packs(a.hi(), b.hi(), OutputType<SignedByte>())));
2833#endif
2834}
2835
2836// ---------------------------------------------------------------------------
2837// generalized extend: no stage
2838// ---------------------------------------------------------------------------
2839
2840// combinations:
2841// - signed -> extended signed (sign extension)
2842// - unsigned -> extended unsigned (zero extension)
2843// - unsigned -> extended signed (zero extension)
2844// - signed -> extended unsigned (saturation and zero extension)
2845
2846// 7. Aug 16 (rm):
2847 // tried to move this to SIMDVecExt.H, but then we get ambiguities with the
2848 // non-avx2 workaround
2849
2850// same types
2851template <typename T>
2852static SIMD_INLINE void extend(const Vec<T, 32> &vIn, Vec<T, 32> vOut[1])
2853{
2854 vOut[0] = vIn;
2855}
2856
2857// same size, different types
2858
2859static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
2860 Vec<Byte, 32> vOut[1])
2861{
2862 vOut[0] = max(vIn, Vec<SignedByte, 32>(_mm256_setzero_si256()));
2863}
2864
2865static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn,
2866 Vec<SignedByte, 32> vOut[1])
2867{
2868 vOut[0] = min(vIn, Vec<Byte, 32>(_mm256_set1_epi8(0x7f)));
2869}
2870
2871static SIMD_INLINE void extend(const Vec<Short, 32> &vIn, Vec<Word, 32> vOut[1])
2872{
2873 vOut[0] = max(vIn, Vec<Short, 32>(_mm256_setzero_si256()));
2874}
2875
2876static SIMD_INLINE void extend(const Vec<Word, 32> &vIn, Vec<Short, 32> vOut[1])
2877{
2878 vOut[0] = min(vIn, Vec<Word, 32>(_mm256_set1_epi16(0x7fff)));
2879}
2880
2881// ---------------------------------------------------------------------------
2882// generalized extend: single stage
2883// ---------------------------------------------------------------------------
2884
2885#ifdef __AVX2__
2886
2887// signed -> signed
2888
2889static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
2890 Vec<Short, 32> vOut[2])
2891{
2892 vOut[0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vIn));
2893 vOut[1] = _mm256_cvtepi8_epi16(_mm256_extractf128_si256(vIn, 1));
2894}
2895
2896static SIMD_INLINE void extend(const Vec<Short, 32> &vIn, Vec<Int, 32> vOut[2])
2897{
2898 vOut[0] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vIn));
2899 vOut[1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(vIn, 1));
2900}
2901
2902static SIMD_INLINE void extend(const Vec<Short, 32> &vIn,
2903 Vec<Float, 32> vOut[2])
2904{
2905 vOut[0] =
2906 _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(vIn)));
2907 vOut[1] =
2908 _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extractf128_si256(vIn, 1)));
2909}
2910
2911static SIMD_INLINE void extend(const Vec<Int, 32> &vIn, Vec<Long, 32> vOut[2])
2912{
2913 vOut[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(vIn));
2914 vOut[1] = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(vIn, 1));
2915}
2916
2917static SIMD_INLINE void extend(const Vec<Int, 32> &vIn, Vec<Double, 32> vOut[2])
2918{
2919 vOut[0] = _mm256_cvtepi32_pd(_mm256_castsi256_si128(vIn));
2920 vOut[1] = _mm256_cvtepi32_pd(_mm256_extractf128_si256(vIn, 1));
2921}
2922
2923static SIMD_INLINE void extend(const Vec<Float, 32> &vIn, Vec<Long, 32> vOut[2])
2924{
2925 // _mm256_cvtps_epi64 is not available in avx
2926 const auto clipped =
2927 _mm256_min_ps(vIn, _mm256_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64));
2928 vOut[0] =
2929 cvts(_mm256_cvtps_pd(_mm256_castps256_ps128(clipped)), OutputType<Long>());
2930 vOut[1] = cvts(_mm256_cvtps_pd(_mm256_extractf128_ps(clipped, 1)),
2931 OutputType<Long>());
2932}
2933
2934static SIMD_INLINE void extend(const Vec<Float, 32> &vIn,
2935 Vec<Double, 32> vOut[2])
2936{
2937 vOut[0] = _mm256_cvtps_pd(_mm256_castps256_ps128(vIn));
2938 vOut[1] = _mm256_cvtps_pd(_mm256_extractf128_ps(vIn, 1));
2939}
2940
2941// unsigned -> unsigned
2942
2943static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn, Vec<Word, 32> vOut[2])
2944{
2945 // there's no _mm256_cvtepu8_epu16()
2946 Vec<Byte, 32> zero = setzero(OutputType<Byte>(), Integer<32>());
2947 // 16. Jul 16 (rm): here we avoid using the generalized unpack from
2948 // SIMDVecExt.H
2949 vOut[0] = unpack(vIn, zero, Part<0>(), Bytes<1>());
2950 vOut[1] = unpack(vIn, zero, Part<1>(), Bytes<1>());
2951}
2952
2953// unsigned -> signed
2954
2955static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn, Vec<Short, 32> vOut[2])
2956{
2957 vOut[0] = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vIn));
2958 vOut[1] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(vIn, 1));
2959}
2960
2961static SIMD_INLINE void extend(const Vec<Word, 32> &vIn, Vec<Int, 32> vOut[2])
2962{
2963 vOut[0] = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(vIn));
2964 vOut[1] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(vIn, 1));
2965}
2966
2967static SIMD_INLINE void extend(const Vec<Word, 32> &vIn, Vec<Float, 32> vOut[2])
2968{
2969 vOut[0] =
2970 _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(vIn)));
2971 vOut[1] =
2972 _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extractf128_si256(vIn, 1)));
2973}
2974
2975// signed -> unsigned
2976
2977static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
2978 Vec<Word, 32> vOut[2])
2979{
2980 // there's no _mm256_cvtepi8_epu16()
2981 const Vec<SignedByte, 32> saturated =
2982 _mm256_max_epi8(vIn, _mm256_setzero_si256());
2983 const Vec<SignedByte, 32> zero = _mm256_setzero_si256();
2984 vOut[0] = unpack(saturated, zero, Part<0>(), Bytes<1>());
2985 vOut[1] = unpack(saturated, zero, Part<1>(), Bytes<1>());
2986}
2987
2988// ---------------------------------------------------------------------------
2989// generalized extend: two stages
2990// ---------------------------------------------------------------------------
2991
2992// signed -> signed
2993
2994static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
2995 Vec<Int, 32> vOut[4])
2996{
2997 __m128i vInLo128 = _mm256_castsi256_si128(vIn);
2998 vOut[0] = _mm256_cvtepi8_epi32(vInLo128);
2999 vOut[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(vInLo128, 8));
3000 __m128i vInHi128 = _mm256_extractf128_si256(vIn, 1);
3001 vOut[2] = _mm256_cvtepi8_epi32(vInHi128);
3002 vOut[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(vInHi128, 8));
3003}
3004
3005static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
3006 Vec<Float, 32> vOut[4])
3007{
3008 Vec<Int, 32> vTmp[4];
3009 extend(vIn, vTmp);
3010 for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
3011}
3012
3013static SIMD_INLINE void extend(const Vec<Short, 32> &vIn, Vec<Long, 32> vOut[4])
3014{
3015 Vec<Int, 32> vTmp[2];
3016 extend(vIn, vTmp);
3017 extend(vTmp[0], vOut);
3018 extend(vTmp[1], vOut + 2);
3019}
3020
3021static SIMD_INLINE void extend(const Vec<Short, 32> &vIn,
3022 Vec<Double, 32> vOut[4])
3023{
3024 Vec<Int, 32> vTmp[2];
3025 extend(vIn, vTmp);
3026 extend(vTmp[0], vOut);
3027 extend(vTmp[1], vOut + 2);
3028}
3029
3030// unsigned -> signed
3031
3032static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn, Vec<Int, 32> vOut[4])
3033{
3034 __m128i vInLo128 = _mm256_castsi256_si128(vIn);
3035 vOut[0] = _mm256_cvtepu8_epi32(vInLo128);
3036 vOut[1] = _mm256_cvtepu8_epi32(_mm_srli_si128(vInLo128, 8));
3037 __m128i vInHi128 = _mm256_extractf128_si256(vIn, 1);
3038 vOut[2] = _mm256_cvtepu8_epi32(vInHi128);
3039 vOut[3] = _mm256_cvtepu8_epi32(_mm_srli_si128(vInHi128, 8));
3040}
3041
3042static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn, Vec<Float, 32> vOut[4])
3043{
3044 Vec<Int, 32> vTmp[4];
3045 extend(vIn, vTmp);
3046 for (size_t i = 0; i < 4; i++) vOut[i] = cvts(vTmp[i], OutputType<Float>());
3047}
3048
3049static SIMD_INLINE void extend(const Vec<Word, 32> &vIn, Vec<Long, 32> vOut[4])
3050{
3051 Vec<Int, 32> vTmp[2];
3052 extend(vIn, vTmp);
3053 extend(vTmp[0], vOut);
3054 extend(vTmp[1], vOut + 2);
3055}
3056
3057static SIMD_INLINE void extend(const Vec<Word, 32> &vIn,
3058 Vec<Double, 32> vOut[4])
3059{
3060 Vec<Int, 32> vTmp[2];
3061 extend(vIn, vTmp);
3062 extend(vTmp[0], vOut);
3063 extend(vTmp[1], vOut + 2);
3064}
3065
3066// ---------------------------------------------------------------------------
3067// generalized extend: three stages
3068// ---------------------------------------------------------------------------
3069
3070// signed -> signed
3071
3072static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
3073 Vec<Long, 32> vOut[8])
3074{
3075 Vec<Int, 32> vTmp[4];
3076 extend(vIn, vTmp);
3077 extend(vTmp[0], vOut);
3078 extend(vTmp[1], vOut + 2);
3079 extend(vTmp[2], vOut + 4);
3080 extend(vTmp[3], vOut + 6);
3081}
3082
3083static SIMD_INLINE void extend(const Vec<SignedByte, 32> &vIn,
3084 Vec<Double, 32> vOut[8])
3085{
3086 Vec<Int, 32> vTmp[4];
3087 extend(vIn, vTmp);
3088 extend(vTmp[0], vOut);
3089 extend(vTmp[1], vOut + 2);
3090 extend(vTmp[2], vOut + 4);
3091 extend(vTmp[3], vOut + 6);
3092}
3093
3094// unsigned -> signed
3095
3096static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn, Vec<Long, 32> vOut[8])
3097{
3098 Vec<Int, 32> vTmp[4];
3099 extend(vIn, vTmp);
3100 extend(vTmp[0], vOut);
3101 extend(vTmp[1], vOut + 2);
3102 extend(vTmp[2], vOut + 4);
3103 extend(vTmp[3], vOut + 6);
3104}
3105
3106static SIMD_INLINE void extend(const Vec<Byte, 32> &vIn,
3107 Vec<Double, 32> vOut[8])
3108{
3109 Vec<Int, 32> vTmp[4];
3110 extend(vIn, vTmp);
3111 extend(vTmp[0], vOut);
3112 extend(vTmp[1], vOut + 2);
3113 extend(vTmp[2], vOut + 4);
3114 extend(vTmp[3], vOut + 6);
3115}
3116
3117#else // __AVX2__
3118
3119// ---------------------------------------------------------------------------
3120// generalized extend: non-avx2 workaround
3121// ---------------------------------------------------------------------------
3122
3123// non-avx2 workaround
3124template <typename Tout, typename Tin,
3125 SIMD_ENABLE_IF(sizeof(Tout) > sizeof(Tin))>
3126static SIMD_INLINE void extend(const Vec<Tin, 32> &vIn,
3127 Vec<Tout, 32> vOut[sizeof(Tout) / sizeof(Tin)])
3128{
3129 const size_t nOut = sizeof(Tout) / sizeof(Tin), nOutHalf = nOut / 2;
3130 Vec<Tout, 16> vOutLo16[nOut], vOutHi16[nOut];
3131 extend(vIn.lo(), vOutLo16);
3132 extend(vIn.hi(), vOutHi16);
3133 for (size_t i = 0; i < nOutHalf; i++) {
3134 vOut[i] = Vec<Tout, 32>(vOutLo16[2 * i], vOutLo16[2 * i + 1]);
3135 vOut[i + nOutHalf] = Vec<Tout, 32>(vOutHi16[2 * i], vOutHi16[2 * i + 1]);
3136 }
3137}
3138
3139#endif
3140
3141// ---------------------------------------------------------------------------
3142// generalized extend: special case int <-> float, long <-> double
3143// ---------------------------------------------------------------------------
3144
3145template <typename Tout, typename Tin,
3146 SIMD_ENABLE_IF(sizeof(Tin) == sizeof(Tout)),
3147 SIMD_ENABLE_IF(std::is_floating_point<Tin>::value !=
3148 std::is_floating_point<Tout>::value)>
3149static SIMD_INLINE void extend(const Vec<Tin, 32> &vIn, Vec<Tout, 32> vOut[1])
3150{
3151 vOut[0] = cvts(vIn, OutputType<Tout>());
3152}
3153
3154// ---------------------------------------------------------------------------
3155// srai
3156// ---------------------------------------------------------------------------
3157
3158#ifdef __AVX2__
3159// 16. Oct 22 (Jonas Keller): added missing Byte and SignedByte versions
3160
3161template <size_t COUNT>
3162static SIMD_INLINE Vec<Byte, 32> srai(const Vec<Byte, 32> &a)
3163{
3164 SIMD_IF_CONSTEXPR (COUNT < 8) {
3165 const __m256i odd = _mm256_srai_epi16(a, COUNT);
3166 const __m256i even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), COUNT + 8);
3167 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3168 } else {
3169 // result should be all ones if a is negative, all zeros otherwise
3170 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3171 }
3172}
3173
3174template <size_t COUNT>
3175static SIMD_INLINE Vec<SignedByte, 32> srai(const Vec<SignedByte, 32> &a)
3176{
3177 SIMD_IF_CONSTEXPR (COUNT < 8) {
3178 const __m256i odd = _mm256_srai_epi16(a, COUNT);
3179 const __m256i even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), COUNT + 8);
3180 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3181 } else {
3182 // result should be all ones if a is negative, all zeros otherwise
3183 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3184 }
3185}
3186
3187template <size_t COUNT>
3188static SIMD_INLINE Vec<Word, 32> srai(const Vec<Word, 32> &a)
3189{
3190 return _mm256_srai_epi16(a, vec::min(COUNT, 15ul));
3191}
3192
3193template <size_t COUNT>
3194static SIMD_INLINE Vec<Short, 32> srai(const Vec<Short, 32> &a)
3195{
3196 return _mm256_srai_epi16(a, vec::min(COUNT, 15ul));
3197}
3198
3199template <size_t COUNT>
3200static SIMD_INLINE Vec<Int, 32> srai(const Vec<Int, 32> &a)
3201{
3202 return _mm256_srai_epi32(a, vec::min(COUNT, 31ul));
3203}
3204
3205template <size_t COUNT>
3206static SIMD_INLINE Vec<Long, 32> srai(const Vec<Long, 32> &a)
3207{ // workaround from Hacker's Delight, 2-17 Double-Length Shifts, Shift right
3208 // double signed:
3209 const __m256i odd = _mm256_srai_epi32(a, vec::min(COUNT, 31ul));
3210 __m256i even;
3211 SIMD_IF_CONSTEXPR (COUNT < 32) {
3212 even =
3213 _mm256_or_si256(_mm256_srli_epi32(a, COUNT),
3214 _mm256_slli_epi32(_mm256_srli_si256(a, 4), 32 - COUNT));
3215 } else {
3216 even =
3217 _mm256_srai_epi32(_mm256_srli_si256(a, 4), vec::min(COUNT - 32, 31ul));
3218 }
3219 return _mm256_blend_epi16(even, odd, 0xcc);
3220}
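
// Illustrative note on the 64-bit arithmetic shift above: the odd 32-bit
// elements (the upper halves of the 64-bit values) are shifted
// arithmetically, the even elements (the lower halves) are shifted logically
// and, for COUNT < 32, receive the bits shifted out of the upper halves;
// _mm256_blend_epi16 with mask 0xcc then recombines upper and lower halves
// into the final 64-bit results.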
3221
3222#else
3223
3224// non-avx2 workaround
3225template <size_t COUNT, typename T>
3226static SIMD_INLINE Vec<T, 32> srai(const Vec<T, 32> &a)
3227{
3228 return Vec<T, 32>(srai<COUNT>(a.lo()), srai<COUNT>(a.hi()));
3229}
3230
3231#endif
3232
3233// ---------------------------------------------------------------------------
3234// srli
3235// ---------------------------------------------------------------------------
3236
3237#ifdef __AVX2__
3238
3239// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
3240// License: not specified
3241template <size_t COUNT>
3242static SIMD_INLINE Vec<Byte, 32> srli(const Vec<Byte, 32> &a)
3243{
3244 SIMD_IF_CONSTEXPR (COUNT < 8) {
3245 return _mm256_and_si256(_mm256_set1_epi8((int8_t) (0xff >> COUNT)),
3246 _mm256_srli_epi32(a, COUNT));
3247 } else {
3248 return _mm256_setzero_si256();
3249 }
3250}
3251
3252// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
3253// License: not specified
3254template <size_t COUNT>
3255static SIMD_INLINE Vec<SignedByte, 32> srli(const Vec<SignedByte, 32> &a)
3256{
3257 SIMD_IF_CONSTEXPR (COUNT < 8) {
3258 return _mm256_and_si256(_mm256_set1_epi8((int8_t) (0xff >> COUNT)),
3259 _mm256_srli_epi32(a, COUNT));
3260 } else {
3261 return _mm256_setzero_si256();
3262 }
3263}
3264
3265template <size_t COUNT>
3266static SIMD_INLINE Vec<Word, 32> srli(const Vec<Word, 32> &a)
3267{
3268 SIMD_IF_CONSTEXPR (COUNT < 16) {
3269 return _mm256_srli_epi16(a, COUNT);
3270 } else {
3271 return _mm256_setzero_si256();
3272 }
3273}
3274
3275template <size_t COUNT>
3276static SIMD_INLINE Vec<Short, 32> srli(const Vec<Short, 32> &a)
3277{
3278 SIMD_IF_CONSTEXPR (COUNT < 16) {
3279 return _mm256_srli_epi16(a, COUNT);
3280 } else {
3281 return _mm256_setzero_si256();
3282 }
3283}
3284
3285template <size_t COUNT>
3286static SIMD_INLINE Vec<Int, 32> srli(const Vec<Int, 32> &a)
3287{
3288 SIMD_IF_CONSTEXPR (COUNT < 32) {
3289 return _mm256_srli_epi32(a, COUNT);
3290 } else {
3291 return _mm256_setzero_si256();
3292 }
3293}
3294
3295template <size_t COUNT>
3296static SIMD_INLINE Vec<Long, 32> srli(const Vec<Long, 32> &a)
3297{
3298 SIMD_IF_CONSTEXPR (COUNT < 64) {
3299 return _mm256_srli_epi64(a, COUNT);
3300 } else {
3301 return _mm256_setzero_si256();
3302 }
3303}
3304
3305#else
3306
3307// non-avx2 workaround
3308template <size_t COUNT, typename T>
3309static SIMD_INLINE Vec<T, 32> srli(const Vec<T, 32> &a)
3310{
3311 return Vec<T, 32>(srli<COUNT>(a.lo()), srli<COUNT>(a.hi()));
3312}
3313
3314#endif
3315
3316// ---------------------------------------------------------------------------
3317// slli
3318// ---------------------------------------------------------------------------
3319
3320#ifdef __AVX2__
3321
3322template <size_t COUNT>
3323static SIMD_INLINE Vec<Byte, 32> slli(const Vec<Byte, 32> &a)
3324{
3325 SIMD_IF_CONSTEXPR (COUNT < 8) {
3326 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
3327 // License: not specified
3328 return _mm256_and_si256(
3329 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
3330 _mm256_slli_epi32(a, COUNT));
3331 } else {
3332 return _mm256_setzero_si256();
3333 }
3334}
3335
3336template <size_t COUNT>
3337static SIMD_INLINE Vec<SignedByte, 32> slli(const Vec<SignedByte, 32> &a)
3338{
3339 SIMD_IF_CONSTEXPR (COUNT < 8) {
3340 // https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
3341 // License: not specified
3342 return _mm256_and_si256(
3343 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
3344 _mm256_slli_epi32(a, COUNT));
3345 } else {
3346 return _mm256_setzero_si256();
3347 }
3348}
3349
3350template <size_t COUNT>
3351static SIMD_INLINE Vec<Word, 32> slli(const Vec<Word, 32> &a)
3352{
3353 SIMD_IF_CONSTEXPR (COUNT < 16) {
3354 return _mm256_slli_epi16(a, COUNT);
3355 } else {
3356 return _mm256_setzero_si256();
3357 }
3358}
3359
3360template <size_t COUNT>
3361static SIMD_INLINE Vec<Short, 32> slli(const Vec<Short, 32> &a)
3362{
3363 SIMD_IF_CONSTEXPR (COUNT < 16) {
3364 return _mm256_slli_epi16(a, COUNT);
3365 } else {
3366 return _mm256_setzero_si256();
3367 }
3368}
3369
3370template <size_t COUNT>
3371static SIMD_INLINE Vec<Int, 32> slli(const Vec<Int, 32> &a)
3372{
3373 SIMD_IF_CONSTEXPR (COUNT < 32) {
3374 return _mm256_slli_epi32(a, COUNT);
3375 } else {
3376 return _mm256_setzero_si256();
3377 }
3378}
3379
3380template <size_t COUNT>
3381static SIMD_INLINE Vec<Long, 32> slli(const Vec<Long, 32> &a)
3382{
3383 SIMD_IF_CONSTEXPR (COUNT < 64) {
3384 return _mm256_slli_epi64(a, COUNT);
3385 } else {
3386 return _mm256_setzero_si256();
3387 }
3388}
3389
3390#else
3391
3392// non-avx2 workaround
3393template <size_t COUNT, typename T>
3394static SIMD_INLINE Vec<T, 32> slli(const Vec<T, 32> &a)
3395{
3396 return Vec<T, 32>(slli<COUNT>(a.lo()), slli<COUNT>(a.hi()));
3397}
3398
3399#endif
3400
3401// 19. Dec 22 (Jonas Keller): added sra, srl and sll functions
3402
3403// ---------------------------------------------------------------------------
3404// sra
3405// ---------------------------------------------------------------------------
3406
3407#ifdef __AVX2__
3408
3409static SIMD_INLINE Vec<Byte, 32> sra(const Vec<Byte, 32> &a,
3410 const uint8_t count)
3411{
3412 if (count >= 8) {
3413 // result should be all ones if a is negative, all zeros otherwise
3414 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3415 }
3416 const __m256i odd = _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3417 const __m256i even =
3418 _mm256_sra_epi16(_mm256_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3419 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3420}
3421
3422static SIMD_INLINE Vec<SignedByte, 32> sra(const Vec<SignedByte, 32> &a,
3423 const uint8_t count)
3424{
3425 if (count >= 8) {
3426 // result should be all ones if a is negative, all zeros otherwise
3427 return _mm256_cmpgt_epi8(_mm256_setzero_si256(), a);
3428 }
3429 const __m256i odd = _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3430 const __m256i even =
3431 _mm256_sra_epi16(_mm256_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
3432 return _mm256_blendv_epi8(even, odd, _mm256_set1_epi16((int16_t) 0xff00));
3433}
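// [documentation sketch, not part of the library] The byte-element arithmetic
// shift above splits the work into two 16-bit arithmetic shifts: the high byte
// of each 16-bit pair is shifted in place, the low byte is first moved to the
// high position (slli by 8) and shifted by count + 8, and the two results are
// blended. The per-byte semantics being reproduced, as a scalar sketch
// (hypothetical helper; assumes the usual arithmetic >> on signed values):
//
// static inline int8_t sra_byte_sketch(int8_t x, unsigned count)
// {
//   if (count >= 8) return (x < 0) ? int8_t(-1) : int8_t(0); // only the sign survives
//   return int8_t(x >> count);                               // sign-filling shift
// }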
3434
3435static SIMD_INLINE Vec<Word, 32> sra(const Vec<Word, 32> &a,
3436 const uint8_t count)
3437{
3438 return _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3439}
3440
3441static SIMD_INLINE Vec<Short, 32> sra(const Vec<Short, 32> &a,
3442 const uint8_t count)
3443{
3444 return _mm256_sra_epi16(a, _mm_cvtsi32_si128(count));
3445}
3446
3447static SIMD_INLINE Vec<Int, 32> sra(const Vec<Int, 32> &a, const uint8_t count)
3448{
3449 return _mm256_sra_epi32(a, _mm_cvtsi32_si128(count));
3450}
3451
3452static SIMD_INLINE Vec<Long, 32> sra(const Vec<Long, 32> &a,
3453 const uint8_t count)
3454{
3455 // workaround from Hacker's Delight, 2–17 Double-Length Shifts, Shift right
3456 // double signed:
3457 const __m256i odd = _mm256_sra_epi32(a, _mm_cvtsi32_si128(count));
3458 __m256i even;
3459 if (count < 32) {
3460 even = _mm256_or_si256(
3461 _mm256_srl_epi32(a, _mm_cvtsi32_si128(count)),
3462 _mm256_sll_epi32(_mm256_srli_si256(a, 4), _mm_cvtsi32_si128(32 - count)));
3463 } else {
3464 even =
3465 _mm256_sra_epi32(_mm256_srli_si256(a, 4), _mm_cvtsi32_si128(count - 32));
3466 }
3467 return _mm256_blend_epi16(even, odd, 0xcc);
3468}
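// [documentation sketch, not part of the library] AVX2 offers no 64-bit
// arithmetic shift, so the Long version above assembles one from 32-bit shifts
// following the Hacker's Delight recipe cited in the comment. Scalar sketch
// with the 64-bit value given as a (hi:lo) pair (hypothetical helper; assumes
// arithmetic >> on signed values; count in [1, 63]):
//
// static inline void sra64_sketch(uint32_t &lo, int32_t &hi, unsigned count)
// {
//   if (count < 32) {
//     lo = (lo >> count) | (uint32_t(hi) << (32 - count)); // bits moving down from hi
//     hi >>= count;
//   } else {
//     lo = uint32_t(hi >> (count - 32)); // hi supplies all remaining low bits
//     hi >>= 31;                         // sign fill
//   }
// }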
3469
3470#else
3471
3472// non-avx2 workaround
3473template <typename T>
3474static SIMD_INLINE Vec<T, 32> sra(const Vec<T, 32> &a, const uint8_t count)
3475{
3476 return Vec<T, 32>(sra(a.lo(), count), sra(a.hi(), count));
3477}
3478
3479#endif
3480
3481// ---------------------------------------------------------------------------
3482// srl
3483// ---------------------------------------------------------------------------
3484
3485#ifdef __AVX2__
3486
3487static SIMD_INLINE Vec<Byte, 32> srl(const Vec<Byte, 32> &a,
3488 const uint8_t count)
3489{
3490 return _mm256_and_si256(_mm256_srl_epi16(a, _mm_cvtsi32_si128(count)),
3491 _mm256_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3492}
3493
3494static SIMD_INLINE Vec<SignedByte, 32> srl(const Vec<SignedByte, 32> &a,
3495 const uint8_t count)
3496{
3497 return _mm256_and_si256(_mm256_srl_epi16(a, _mm_cvtsi32_si128(count)),
3498 _mm256_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
3499}
3500
3501static SIMD_INLINE Vec<Word, 32> srl(const Vec<Word, 32> &a,
3502 const uint8_t count)
3503{
3504 return _mm256_srl_epi16(a, _mm_cvtsi32_si128(count));
3505}
3506
3507static SIMD_INLINE Vec<Short, 32> srl(const Vec<Short, 32> &a,
3508 const uint8_t count)
3509{
3510 return _mm256_srl_epi16(a, _mm_cvtsi32_si128(count));
3511}
3512
3513static SIMD_INLINE Vec<Int, 32> srl(const Vec<Int, 32> &a, const uint8_t count)
3514{
3515 return _mm256_srl_epi32(a, _mm_cvtsi32_si128(count));
3516}
3517
3518static SIMD_INLINE Vec<Long, 32> srl(const Vec<Long, 32> &a,
3519 const uint8_t count)
3520{
3521 return _mm256_srl_epi64(a, _mm_cvtsi32_si128(count));
3522}
3523
3524#else
3525
3526// non-avx2 workaround
3527template <typename T>
3528static SIMD_INLINE Vec<T, 32> srl(const Vec<T, 32> &a, const uint8_t count)
3529{
3530 return Vec<T, 32>(srl(a.lo(), count), srl(a.hi(), count));
3531}
3532
3533#endif
3534
3535// ---------------------------------------------------------------------------
3536// sll
3537// ---------------------------------------------------------------------------
3538
3539#ifdef __AVX2__
3540
3541static SIMD_INLINE Vec<Byte, 32> sll(const Vec<Byte, 32> &a,
3542 const uint8_t count)
3543{
3544 return _mm256_and_si256(
3545 _mm256_sll_epi16(a, _mm_cvtsi32_si128(count)),
3546 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3547}
3548
3549static SIMD_INLINE Vec<SignedByte, 32> sll(const Vec<SignedByte, 32> &a,
3550 const uint8_t count)
3551{
3552 return _mm256_and_si256(
3553 _mm256_sll_epi16(a, _mm_cvtsi32_si128(count)),
3554 _mm256_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
3555}
3556
3557static SIMD_INLINE Vec<Word, 32> sll(const Vec<Word, 32> &a,
3558 const uint8_t count)
3559{
3560 return _mm256_sll_epi16(a, _mm_cvtsi32_si128(count));
3561}
3562
3563static SIMD_INLINE Vec<Short, 32> sll(const Vec<Short, 32> &a,
3564 const uint8_t count)
3565{
3566 return _mm256_sll_epi16(a, _mm_cvtsi32_si128(count));
3567}
3568
3569static SIMD_INLINE Vec<Int, 32> sll(const Vec<Int, 32> &a, const uint8_t count)
3570{
3571 return _mm256_sll_epi32(a, _mm_cvtsi32_si128(count));
3572}
3573
3574static SIMD_INLINE Vec<Long, 32> sll(const Vec<Long, 32> &a,
3575 const uint8_t count)
3576{
3577 return _mm256_sll_epi64(a, _mm_cvtsi32_si128(count));
3578}
3579
3580#else
3581
3582// non-avx2 workaround
3583template <typename T>
3584static SIMD_INLINE Vec<T, 32> sll(const Vec<T, 32> &a, const uint8_t count)
3585{
3586 return Vec<T, 32>(sll(a.lo(), count), sll(a.hi(), count));
3587}
3588
3589#endif
3590
3591// 19. Sep 22 (Jonas Keller):
3592// added Byte and SignedByte versions of hadd, hadds, hsub and hsubs
3593// added Word version of hadds and hsubs
3594
3595// ---------------------------------------------------------------------------
3596// hadd
3597// ---------------------------------------------------------------------------
3598
3599template <typename T>
3600static SIMD_INLINE Vec<T, 32> hadd(const Vec<T, 32> &a, const Vec<T, 32> &b)
3601{
3602 Vec<T, 32> x, y;
3603 unzip<1>(a, b, x, y);
3604 return add(x, y);
3605}
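// [documentation sketch, not part of the library] The generic template above
// realizes horizontal addition as "de-interleave, then add", which produces
// the usual hadd element order: all pairwise sums of a, followed by all
// pairwise sums of b. The intrinsic-based versions below reach the same order;
// the x_mm256_transpose4x64_* wrappers reorder the four 64-bit chunks because
// the _mm256_hadd_* instructions work per 128-bit lane. Scalar reference
// (hypothetical helper):
//
// template <typename T, size_t N>
// static inline void hadd_ref_sketch(const T (&a)[N], const T (&b)[N], T (&out)[N])
// {
//   for (size_t i = 0; i < N / 2; i++) {
//     out[i] = T(a[2 * i] + a[2 * i + 1]);         // pairwise sums of a
//     out[i + N / 2] = T(b[2 * i] + b[2 * i + 1]); // then pairwise sums of b
//   }
// }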
3606
3607static SIMD_INLINE Vec<Word, 32> hadd(const Vec<Word, 32> &a,
3608 const Vec<Word, 32> &b)
3609{
3610 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi16(a, b));
3611}
3612
3613static SIMD_INLINE Vec<Short, 32> hadd(const Vec<Short, 32> &a,
3614 const Vec<Short, 32> &b)
3615{
3616 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi16(a, b));
3617}
3618
3619static SIMD_INLINE Vec<Int, 32> hadd(const Vec<Int, 32> &a,
3620 const Vec<Int, 32> &b)
3621{
3622 return x_mm256_transpose4x64_epi64(x_mm256_hadd_epi32(a, b));
3623}
3624
3625static SIMD_INLINE Vec<Float, 32> hadd(const Vec<Float, 32> &a,
3626 const Vec<Float, 32> &b)
3627{
3628 return x_mm256_transpose4x64_ps(_mm256_hadd_ps(a, b));
3629}
3630
3631static SIMD_INLINE Vec<Double, 32> hadd(const Vec<Double, 32> &a,
3632 const Vec<Double, 32> &b)
3633{
3634 return x_mm256_transpose4x64_pd(_mm256_hadd_pd(a, b));
3635}
3636
3637// ---------------------------------------------------------------------------
3638// hadds
3639// ---------------------------------------------------------------------------
3640
3641// 09. Mar 23 (Jonas Keller): made Int version of hadds saturating
3642
3643template <typename T>
3644static SIMD_INLINE Vec<T, 32> hadds(const Vec<T, 32> &a, const Vec<T, 32> &b)
3645{
3646 Vec<T, 32> x, y;
3647 unzip<1>(a, b, x, y);
3648 return adds(x, y);
3649}
3650
3651static SIMD_INLINE Vec<Short, 32> hadds(const Vec<Short, 32> &a,
3652 const Vec<Short, 32> &b)
3653{
3654 return x_mm256_transpose4x64_epi64(x_mm256_hadds_epi16(a, b));
3655}
3656
3657// Float not saturated
3658static SIMD_INLINE Vec<Float, 32> hadds(const Vec<Float, 32> &a,
3659 const Vec<Float, 32> &b)
3660{
3661 return x_mm256_transpose4x64_ps(_mm256_hadd_ps(a, b));
3662}
3663
3664// Double not saturated
3665static SIMD_INLINE Vec<Double, 32> hadds(const Vec<Double, 32> &a,
3666 const Vec<Double, 32> &b)
3667{
3668 return x_mm256_transpose4x64_pd(_mm256_hadd_pd(a, b));
3669}
3670
3671// ---------------------------------------------------------------------------
3672// hsub
3673// ---------------------------------------------------------------------------
3674
3675template <typename T>
3676static SIMD_INLINE Vec<T, 32> hsub(const Vec<T, 32> &a, const Vec<T, 32> &b)
3677{
3678 Vec<T, 32> x, y;
3679 unzip<1>(a, b, x, y);
3680 return sub(x, y);
3681}
3682
3683static SIMD_INLINE Vec<Word, 32> hsub(const Vec<Word, 32> &a,
3684 const Vec<Word, 32> &b)
3685{
3686 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi16(a, b));
3687}
3688
3689static SIMD_INLINE Vec<Short, 32> hsub(const Vec<Short, 32> &a,
3690 const Vec<Short, 32> &b)
3691{
3692 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi16(a, b));
3693}
3694
3695static SIMD_INLINE Vec<Int, 32> hsub(const Vec<Int, 32> &a,
3696 const Vec<Int, 32> &b)
3697{
3698 return x_mm256_transpose4x64_epi64(x_mm256_hsub_epi32(a, b));
3699}
3700
3701static SIMD_INLINE Vec<Float, 32> hsub(const Vec<Float, 32> &a,
3702 const Vec<Float, 32> &b)
3703{
3704 return x_mm256_transpose4x64_ps(_mm256_hsub_ps(a, b));
3705}
3706
3707static SIMD_INLINE Vec<Double, 32> hsub(const Vec<Double, 32> &a,
3708 const Vec<Double, 32> &b)
3709{
3710 return x_mm256_transpose4x64_pd(_mm256_hsub_pd(a, b));
3711}
3712
3713// ---------------------------------------------------------------------------
3714// hsubs
3715// ---------------------------------------------------------------------------
3716
3717// 09. Mar 23 (Jonas Keller): made Int version of hsubs saturating
3718
3719template <typename T>
3720static SIMD_INLINE Vec<T, 32> hsubs(const Vec<T, 32> &a, const Vec<T, 32> &b)
3721{
3722 Vec<T, 32> x, y;
3723 unzip<1>(a, b, x, y);
3724 return subs(x, y);
3725}
3726
3727static SIMD_INLINE Vec<Short, 32> hsubs(const Vec<Short, 32> &a,
3728 const Vec<Short, 32> &b)
3729{
3730 return x_mm256_transpose4x64_epi64(x_mm256_hsubs_epi16(a, b));
3731}
3732
3733// Float not saturated
3734static SIMD_INLINE Vec<Float, 32> hsubs(const Vec<Float, 32> &a,
3735 const Vec<Float, 32> &b)
3736{
3737 return x_mm256_transpose4x64_ps(_mm256_hsub_ps(a, b));
3738}
3739
3740// Double not saturated
3741static SIMD_INLINE Vec<Double, 32> hsubs(const Vec<Double, 32> &a,
3742 const Vec<Double, 32> &b)
3743{
3744 return x_mm256_transpose4x64_pd(_mm256_hsub_pd(a, b));
3745}
3746
3747// ---------------------------------------------------------------------------
3748// element-wise shift right
3749// ---------------------------------------------------------------------------
3750
3751template <size_t COUNT, typename T>
3752static SIMD_INLINE Vec<T, 32> srle(const Vec<T, 32> &a)
3753{
3754 const __m256i aInt = reinterpret(a, OutputType<Int>());
3755 const Vec<Int, 32> aShifted = x_mm256_srli256_si256<COUNT * sizeof(T)>(aInt);
3756 return reinterpret(aShifted, OutputType<T>());
3757}
3758
3759// ---------------------------------------------------------------------------
3760// element-wise shift left
3761// ---------------------------------------------------------------------------
3762
3763template <size_t COUNT, typename T>
3764static SIMD_INLINE Vec<T, 32> slle(const Vec<T, 32> &a)
3765{
3766 const __m256i aInt = reinterpret(a, OutputType<Int>());
3767 const Vec<Int, 32> aShifted = x_mm256_slli256_si256<COUNT * sizeof(T)>(aInt);
3768 return reinterpret(aShifted, OutputType<T>());
3769}
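// [documentation sketch, not part of the library] srle/slle shift whole
// elements (not bits) across the full 256-bit vector by expressing the shift
// as a byte shift of COUNT * sizeof(T); vacated positions are filled with
// zeros. Hedged usage sketch, assuming element 0 is the lowest-addressed
// element:
//
//   Vec<Int, 32> v;               // elements v0 v1 v2 v3 v4 v5 v6 v7
//   Vec<Int, 32> r = srle<2>(v);  // elements v2 v3 v4 v5 v6 v7  0  0
//   Vec<Int, 32> l = slle<1>(v);  // elements  0 v0 v1 v2 v3 v4 v5 v6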
3770
3771// ---------------------------------------------------------------------------
3772// alignre
3773// ---------------------------------------------------------------------------
3774
3775// all integer versions
3776template <size_t COUNT, typename T>
3777static SIMD_INLINE Vec<T, 32> alignre(const Vec<T, 32> &h, const Vec<T, 32> &l)
3778{
3779 const auto intH = reinterpret(h, OutputType<Int>());
3780 const auto intL = reinterpret(l, OutputType<Int>());
3781 const Vec<Int, 32> intRes =
3782 x_mm256_alignr256_epi8<COUNT * sizeof(T)>(intH, intL);
3783 return reinterpret(intRes, OutputType<T>());
3784}
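// [documentation sketch, not part of the library] alignre concatenates h:l
// (h in the upper half) and extracts one vector's worth of elements starting
// at element COUNT of l, which gives a sliding window over two consecutive
// vectors. Hedged example for Int elements, lowest element first:
//
//   l = l0 l1 l2 l3 l4 l5 l6 l7,  h = h0 h1 h2 h3 h4 h5 h6 h7
//   alignre<3>(h, l) = l3 l4 l5 l6 l7 h0 h1 h2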
3785
3786// ---------------------------------------------------------------------------
3787// swizzle
3788// ---------------------------------------------------------------------------
3789
3790// ---------- swizzle aux functions -----------
3791
3792 // ALIGNOFF is the offset in bytes (i.e. counted in byte-sized elements)
3793template <size_t ALIGNOFF>
3794static SIMD_INLINE __m256i align_shuffle_256(__m256i lo, __m256i hi,
3795 __m256i mask)
3796{
3797 static_assert(ALIGNOFF < 32, "");
3798 return x_mm256_shuffle_epi8(x_mm256_alignr_epi8<ALIGNOFF>(hi, lo), mask);
3799}
3800
3801// ---------- swizzle (AoS to SoA) ----------
3802
3803// 01. Apr 23 (Jonas Keller): switched from using tag dispatching to using
3804// enable_if SFINAE, which allows more cases with the same implementation
3805// to be combined
3806
3807// -------------------- n = 1 --------------------
3808
3809// all types
3810template <typename T>
3811static SIMD_INLINE void swizzle(Vec<T, 32>[1], Integer<1>)
3812{
3813 // v remains unchanged
3814}
3815
3816// -------------------- n = 2 --------------------
3817
3818// 8 and 16 bit integer types
3819template <typename T,
3820 SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
3821static SIMD_INLINE void swizzle(Vec<T, 32> v[2], Integer<2>)
3822{
3823 Vec<T, 32> vs[2];
3824 swizzle_32_16<2>(v, vs);
3825 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<2, T>());
3826 const __m256i s[2] = {
3827 x_mm256_shuffle_epi8(vs[0], mask),
3828 x_mm256_shuffle_epi8(vs[1], mask),
3829 };
3830 v[0] = x_mm256_unpacklo_epi64(s[0], s[1]);
3831 v[1] = x_mm256_unpackhi_epi64(s[0], s[1]);
3832}
3833
3834// 32 bit types
3835template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3836static SIMD_INLINE void swizzle(Vec<T, 32> v[2], Integer<2>)
3837{
3838 const Vec<Float, 32> vFloat[2] = {
3839 reinterpret(v[0], OutputType<Float>()),
3840 reinterpret(v[1], OutputType<Float>()),
3841 };
3842 Vec<Float, 32> vs[2];
3843 swizzle_32_16<2>(vFloat, vs);
3844 const Vec<Float, 32> vOut[2] = {
3845 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(2, 0, 2, 0)),
3846 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 1, 3, 1)),
3847 };
3848 v[0] = reinterpret(vOut[0], OutputType<T>());
3849 v[1] = reinterpret(vOut[1], OutputType<T>());
3850}
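// [documentation sketch, not part of the library] For 32-bit types the
// two-channel de-interleave above is done with _mm256_shuffle_ps: the even
// source elements go to v[0] and the odd ones to v[1] (after swizzle_32_16
// has fixed up the 128-bit halves). Scalar sketch of the AoS-to-SoA effect
// for interleaved (x, y) pairs (hypothetical helper; <cstddef> is included
// at the top of this file):
//
// static inline void deinterleave2_sketch(const float *in, float *outX,
//                                         float *outY, size_t n)
// {
//   for (size_t i = 0; i < n; i++) {
//     outX[i] = in[2 * i];     // even positions -> first channel
//     outY[i] = in[2 * i + 1]; // odd positions  -> second channel
//   }
// }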
3851
3852// 64 bit types
3853template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3854 typename = void>
3855static SIMD_INLINE void swizzle(Vec<T, 32> v[2], Integer<2>)
3856{
3857 const Vec<Double, 32> vDouble[2] = {
3858 reinterpret(v[0], OutputType<Double>()),
3859 reinterpret(v[1], OutputType<Double>()),
3860 };
3861 Vec<Double, 32> vs[2];
3862 swizzle_32_16<2>(vDouble, vs);
3863 const Vec<Double, 32> vOut[2] = {
3864 _mm256_shuffle_pd(vs[0], vs[1], 0),
3865 _mm256_shuffle_pd(vs[0], vs[1], 0xf),
3866 };
3867 v[0] = reinterpret(vOut[0], OutputType<T>());
3868 v[1] = reinterpret(vOut[1], OutputType<T>());
3869}
3870
3871// -------------------- n = 3 --------------------
3872
3873// 8 and 16 bit integer types
3874template <typename T,
3875 SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
3876static SIMD_INLINE void swizzle(Vec<T, 32> v[3], Integer<3>)
3877{
3878 Vec<T, 32> vs[3];
3879 swizzle_32_16<3>(v, vs);
3880 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<3, T>());
3881 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
3882 const __m256i s1 = align_shuffle_256<12>(vs[0], vs[1], mask);
3883 const __m256i s2 = align_shuffle_256<8>(vs[1], vs[2], mask);
3884 const __m256i s3 =
3885 align_shuffle_256<4>(vs[2], _mm256_undefined_si256(), mask);
3886 const __m256i l01 = x_mm256_unpacklo_epi32(s0, s1);
3887 const __m256i h01 = x_mm256_unpackhi_epi32(s0, s1);
3888 const __m256i l23 = x_mm256_unpacklo_epi32(s2, s3);
3889 const __m256i h23 = x_mm256_unpackhi_epi32(s2, s3);
3890 v[0] = x_mm256_unpacklo_epi64(l01, l23);
3891 v[1] = x_mm256_unpackhi_epi64(l01, l23);
3892 v[2] = x_mm256_unpacklo_epi64(h01, h23);
3893}
3894
3895// 32 bit types
3896// from Stan Melax: "3D Vector Normalization..."
3897// https://software.intel.com/en-us/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx
3898template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3899static SIMD_INLINE void swizzle(Vec<T, 32> v[3], Integer<3>)
3900{
3901 const Vec<Float, 32> vFloat[3] = {
3902 reinterpret(v[0], OutputType<Float>()),
3903 reinterpret(v[1], OutputType<Float>()),
3904 reinterpret(v[2], OutputType<Float>()),
3905 };
3906 Vec<Float, 32> vs[3];
3907 swizzle_32_16<3>(vFloat, vs);
3908 // x0y0z0x1 = v[0]
3909 // y1z1x2y2 = v[1]
3910 // z2x3y3z3 = v[2]
3911 const Vec<Float, 32> x2y2x3y3 =
3912 _mm256_shuffle_ps(vs[1], vs[2], _MM_SHUFFLE(2, 1, 3, 2));
3913 const Vec<Float, 32> y0z0y1z1 =
3914 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 2, 1));
3915 const Vec<Float, 32> x0x1x2x3 =
3916 _mm256_shuffle_ps(vs[0], x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
3917 const Vec<Float, 32> y0y1y2y3 =
3918 _mm256_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
3919 const Vec<Float, 32> z0z1z2z3 =
3920 _mm256_shuffle_ps(y0z0y1z1, vs[2], _MM_SHUFFLE(3, 0, 3, 1));
3921 v[0] = reinterpret(x0x1x2x3, OutputType<T>());
3922 v[1] = reinterpret(y0y1y2y3, OutputType<T>());
3923 v[2] = reinterpret(z0z1z2z3, OutputType<T>());
3924}
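// [documentation sketch, not part of the library] The 32-bit three-channel
// case follows Stan Melax's AVX recipe cited above: after the half-vector
// pre-shuffle, three _mm256_shuffle_ps steps separate x, y and z. Overall
// effect in scalar form (hypothetical helper):
//
// static inline void deinterleave3_sketch(const float *in, float *x, float *y,
//                                         float *z, size_t n)
// {
//   for (size_t i = 0; i < n; i++) {
//     x[i] = in[3 * i + 0]; // AoS triples ...
//     y[i] = in[3 * i + 1];
//     z[i] = in[3 * i + 2]; // ... become three SoA planes
//   }
// }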
3925
3926// 64 bit types
3927template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3928 typename = void>
3929static SIMD_INLINE void swizzle(Vec<T, 32> v[3], Integer<3>)
3930{
3931 const Vec<Double, 32> vDouble[3] = {
3932 reinterpret(v[0], OutputType<Double>()), // x0y0z0x1
3933 reinterpret(v[1], OutputType<Double>()), // y1z1x2y2
3934 reinterpret(v[2], OutputType<Double>()), // z2x3y3z3
3935 };
3936 Vec<Double, 32> vs[3];
3937 swizzle_32_16<3>(vDouble, vs);
3938 // vs[0] = x0y0x2y2
3939 // vs[1] = z0x1z2x3
3940 // vs[2] = y1z1y3z3
3941 const Vec<Double, 32> vOut[3] = {
3942 // x0x1x2x3
3943 _mm256_shuffle_pd(vs[0], vs[1], 0xa), // 0b1010
3944 // y0y1y2y3
3945 _mm256_shuffle_pd(vs[0], vs[2], 0x5), // 0b0101
3946 // z0z1z2z3
3947 _mm256_shuffle_pd(vs[1], vs[2], 0xa), // 0b1010
3948 };
3949 v[0] = reinterpret(vOut[0], OutputType<T>());
3950 v[1] = reinterpret(vOut[1], OutputType<T>());
3951 v[2] = reinterpret(vOut[2], OutputType<T>());
3952}
3953
3954// -------------------- n = 4 --------------------
3955
3956// 8 and 16 bit integer types
3957template <typename T,
3958 SIMD_ENABLE_IF((sizeof(T) <= 2 && std::is_integral<T>::value))>
3959static SIMD_INLINE void swizzle(Vec<T, 32> v[4], Integer<4>)
3960{
3961 Vec<T, 32> vs[4];
3962 swizzle_32_16<4>(v, vs);
3963 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<4, T>());
3964 const __m256i s[4] = {
3965 x_mm256_shuffle_epi8(vs[0], mask),
3966 x_mm256_shuffle_epi8(vs[1], mask),
3967 x_mm256_shuffle_epi8(vs[2], mask),
3968 x_mm256_shuffle_epi8(vs[3], mask),
3969 };
3970 const __m256i l01 = x_mm256_unpacklo_epi32(s[0], s[1]);
3971 const __m256i h01 = x_mm256_unpackhi_epi32(s[0], s[1]);
3972 const __m256i l23 = x_mm256_unpacklo_epi32(s[2], s[3]);
3973 const __m256i h23 = x_mm256_unpackhi_epi32(s[2], s[3]);
3974 v[0] = x_mm256_unpacklo_epi64(l01, l23);
3975 v[1] = x_mm256_unpackhi_epi64(l01, l23);
3976 v[2] = x_mm256_unpacklo_epi64(h01, h23);
3977 v[3] = x_mm256_unpackhi_epi64(h01, h23);
3978}
3979
3980// 32 bit types
3981template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3982static SIMD_INLINE void swizzle(Vec<T, 32> v[4], Integer<4>)
3983{
3984 Vec<Float, 32> vFloat[4];
3985 for (size_t i = 0; i < 4; ++i) {
3986 vFloat[i] = reinterpret(v[i], OutputType<Float>());
3987 }
3988 Vec<Float, 32> vs[4];
3989 swizzle_32_16<4>(vFloat, vs);
3990 const __m256 s[4] = {
3991 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(1, 0, 1, 0)),
3992 _mm256_shuffle_ps(vs[0], vs[1], _MM_SHUFFLE(3, 2, 3, 2)),
3993 _mm256_shuffle_ps(vs[2], vs[3], _MM_SHUFFLE(1, 0, 1, 0)),
3994 _mm256_shuffle_ps(vs[2], vs[3], _MM_SHUFFLE(3, 2, 3, 2)),
3995 };
3996 const Vec<Float, 32> vOut[4] = {
3997 _mm256_shuffle_ps(s[0], s[2], _MM_SHUFFLE(2, 0, 2, 0)),
3998 _mm256_shuffle_ps(s[0], s[2], _MM_SHUFFLE(3, 1, 3, 1)),
3999 _mm256_shuffle_ps(s[1], s[3], _MM_SHUFFLE(2, 0, 2, 0)),
4000 _mm256_shuffle_ps(s[1], s[3], _MM_SHUFFLE(3, 1, 3, 1)),
4001 };
4002 for (size_t i = 0; i < 4; ++i) {
4003 v[i] = reinterpret(vOut[i], OutputType<T>());
4004 }
4005}
4006
4007// 64 bit types
4008template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
4009 typename = void>
4010static SIMD_INLINE void swizzle(Vec<T, 32> v[4], Integer<4>)
4011{
4012 const Vec<Double, 32> vDouble[4] = {
4013 reinterpret(v[0], OutputType<Double>()), // x0y0z0w0
4014 reinterpret(v[1], OutputType<Double>()), // x1y1z1w1
4015 reinterpret(v[2], OutputType<Double>()), // x2y2z2w2
4016 reinterpret(v[3], OutputType<Double>()), // x3y3z3w3
4017 };
4018 Vec<Double, 32> vs[4];
4019 swizzle_32_16<4>(vDouble, vs);
4020 // vs[0] = x0y0x2y2
4021 // vs[1] = z0w0z2w2
4022 // vs[2] = x1y1x3y3
4023 // vs[3] = z1w1z3w3
4024 const Vec<Double, 32> vOut[4] = {
4025 // x0x1x2x3
4026 _mm256_shuffle_pd(vs[0], vs[2], 0x0), // 0b0000
4027 // y0y1y2y3
4028 _mm256_shuffle_pd(vs[0], vs[2], 0xF), // 0b1111
4029 // z0z1z2z3
4030 _mm256_shuffle_pd(vs[1], vs[3], 0x0), // 0b0000
4031 // w0w1w2w3
4032 _mm256_shuffle_pd(vs[1], vs[3], 0xF), // 0b1111
4033 };
4034 for (size_t i = 0; i < 4; ++i) {
4035 v[i] = reinterpret(vOut[i], OutputType<T>());
4036 }
4037}
4038
4039// -------------------- n = 5 --------------------
4040
4041// 8 bit integer types
4042template <typename T,
4043 SIMD_ENABLE_IF(sizeof(T) == 1 && std::is_integral<T>::value)>
4044static SIMD_INLINE void swizzle(Vec<T, 32> v[5], Integer<5>)
4045{
4046 Vec<T, 32> vs[5];
4047 swizzle_32_16<5>(v, vs);
4048 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<5, T>());
4049 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
4050 const __m256i s1 = align_shuffle_256<10>(vs[0], vs[1], mask);
4051 const __m256i s2 = align_shuffle_256<4>(vs[1], vs[2], mask);
4052 const __m256i s3 = align_shuffle_256<14>(vs[1], vs[2], mask);
4053 const __m256i s4 = align_shuffle_256<8>(vs[2], vs[3], mask);
4054 const __m256i s5 = align_shuffle_256<2>(vs[3], vs[4], mask);
4055 const __m256i s6 = align_shuffle_256<12>(vs[3], vs[4], mask);
4056 const __m256i s7 =
4057 align_shuffle_256<6>(vs[4], _mm256_undefined_si256(), mask);
4058 const __m256i l01 = x_mm256_unpacklo_epi16(s0, s1);
4059 const __m256i h01 = x_mm256_unpackhi_epi16(s0, s1);
4060 const __m256i l23 = x_mm256_unpacklo_epi16(s2, s3);
4061 const __m256i h23 = x_mm256_unpackhi_epi16(s2, s3);
4062 const __m256i l45 = x_mm256_unpacklo_epi16(s4, s5);
4063 const __m256i h45 = x_mm256_unpackhi_epi16(s4, s5);
4064 const __m256i l67 = x_mm256_unpacklo_epi16(s6, s7);
4065 const __m256i h67 = x_mm256_unpackhi_epi16(s6, s7);
4066 const __m256i ll01l23 = x_mm256_unpacklo_epi32(l01, l23);
4067 const __m256i hl01l23 = x_mm256_unpackhi_epi32(l01, l23);
4068 const __m256i ll45l67 = x_mm256_unpacklo_epi32(l45, l67);
4069 const __m256i hl45l67 = x_mm256_unpackhi_epi32(l45, l67);
4070 const __m256i lh01h23 = x_mm256_unpacklo_epi32(h01, h23);
4071 const __m256i lh45h67 = x_mm256_unpacklo_epi32(h45, h67);
4072 v[0] = x_mm256_unpacklo_epi64(ll01l23, ll45l67);
4073 v[1] = x_mm256_unpackhi_epi64(ll01l23, ll45l67);
4074 v[2] = x_mm256_unpacklo_epi64(hl01l23, hl45l67);
4075 v[3] = x_mm256_unpackhi_epi64(hl01l23, hl45l67);
4076 v[4] = x_mm256_unpacklo_epi64(lh01h23, lh45h67);
4077}
4078
4079// 16 bit integer types
4080template <typename T,
4081 SIMD_ENABLE_IF(sizeof(T) == 2 && std::is_integral<T>::value),
4082 typename = void>
4083static SIMD_INLINE void swizzle(Vec<T, 32> v[5], Integer<5>)
4084{
4085 Vec<T, 32> vs[5];
4086 swizzle_32_16<5>(v, vs);
4087 const __m256i mask = x_mm256_duplicate_si128(get_swizzle_mask<5, T>());
4088 const __m256i s0 = align_shuffle_256<0>(vs[0], vs[1], mask);
4089 const __m256i s1 = align_shuffle_256<6>(vs[0], vs[1], mask);
4090 const __m256i s2 = align_shuffle_256<4>(vs[1], vs[2], mask);
4091 const __m256i s3 = align_shuffle_256<10>(vs[1], vs[2], mask);
4092 const __m256i s4 = align_shuffle_256<8>(vs[2], vs[3], mask);
4093 const __m256i s5 = align_shuffle_256<14>(vs[2], vs[3], mask);
4094 const __m256i s6 = align_shuffle_256<12>(vs[3], vs[4], mask);
4095 const __m256i s7 =
4096 align_shuffle_256<2>(vs[4], _mm256_undefined_si256(), mask);
4097 const __m256i l02 = x_mm256_unpacklo_epi32(s0, s2);
4098 const __m256i h02 = x_mm256_unpackhi_epi32(s0, s2);
4099 const __m256i l13 = x_mm256_unpacklo_epi32(s1, s3);
4100 const __m256i l46 = x_mm256_unpacklo_epi32(s4, s6);
4101 const __m256i h46 = x_mm256_unpackhi_epi32(s4, s6);
4102 const __m256i l57 = x_mm256_unpacklo_epi32(s5, s7);
4103 v[0] = x_mm256_unpacklo_epi64(l02, l46);
4104 v[1] = x_mm256_unpackhi_epi64(l02, l46);
4105 v[2] = x_mm256_unpacklo_epi64(h02, h46);
4106 v[3] = x_mm256_unpacklo_epi64(l13, l57);
4107 v[4] = x_mm256_unpackhi_epi64(l13, l57);
4108}
4109
4110// 32 bit types
4111template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void,
4112 typename = void>
4113static SIMD_INLINE void swizzle(Vec<T, 32> v[5], Integer<5>)
4114{
4115 Vec<Int, 32> vInt[5];
4116 for (size_t i = 0; i < 5; i++) {
4117 vInt[i] = reinterpret(v[i], OutputType<Int>());
4118 }
4119 Vec<Int, 32> vs[5];
4120 swizzle_32_16<5>(vInt, vs);
4121 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4122 // v[0]: 0 1 2 3
4123 // v[1]: 4 x x x
4124 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4125 // x x x x
4126 // 5 6 7 8
4127 const __m256i s2 = x_mm256_alignr_epi8<4>(vs[2], vs[1]);
4128 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4129 // x x x x
4130 // 9 x x x
4131 const __m256i s3 = x_mm256_alignr_epi8<4>(vs[3], vs[2]);
4132 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4133 // x x x x
4134 // 10 11 12 13
4135 const __m256i s4 = x_mm256_alignr_epi8<8>(vs[3], vs[2]);
4136 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4137 // x x x x
4138 // 14 x x x
4139 const __m256i s5 = x_mm256_alignr_epi8<8>(vs[4], vs[3]);
4140 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4141 // X X X X
4142 // 15 16 17 18
4143 const __m256i s6 = x_mm256_alignr_epi8<12>(vs[4], vs[3]);
4144 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
4145 // X X X X
4146 // 19 x x x
4147 const __m256i s7 = x_mm256_alignr_epi8<12>(vs[0], vs[4]);
4148 // 0 1 2 3 / 5 6 7 8 -> 0 5 1 6 / 2 7 3 8
4149 const __m256i l02 = x_mm256_unpacklo_epi32(vs[0], s2);
4150 const __m256i h02 = x_mm256_unpackhi_epi32(vs[0], s2);
4151 // 4 x x x / 9 x x x -> 4 9 x x
4152 const __m256i l13 = x_mm256_unpacklo_epi32(vs[1], s3);
4153 // 10 11 12 13 / 15 16 17 18 -> 10 15 11 16 / 12 17 13 18
4154 const __m256i l46 = x_mm256_unpacklo_epi32(s4, s6);
4155 const __m256i h46 = x_mm256_unpackhi_epi32(s4, s6);
4156 // 14 x x x / 19 x x x -> 14 19 x x
4157 const __m256i l57 = x_mm256_unpacklo_epi32(s5, s7);
4158 const Vec<Int, 32> vOut[5] = {
4159 // 0 5 1 6 / 10 15 11 16 -> 0 5 10 15 / 1 6 11 16
4160 x_mm256_unpacklo_epi64(l02, l46),
4161 x_mm256_unpackhi_epi64(l02, l46),
4162 // 2 7 3 8 / 12 17 13 18 -> 2 7 12 17 / 3 8 13 18
4163 x_mm256_unpacklo_epi64(h02, h46),
4164 x_mm256_unpackhi_epi64(h02, h46),
4165 // 4 9 x x / 14 19 x x -> 4 9 14 19
4166 x_mm256_unpacklo_epi64(l13, l57),
4167 };
4168 for (size_t i = 0; i < 5; i++) {
4169 v[i] = reinterpret(vOut[i], OutputType<T>());
4170 }
4171}
4172
4173// 64 bit types
4174template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
4175 typename = void, typename = void>
4176static SIMD_INLINE void swizzle(Vec<T, 32> v[5], Integer<5>)
4177{
4178 const Vec<Double, 32> vDouble[5] = {
4179 reinterpret(v[0], OutputType<Double>()), // x0y0z0w0
4180 reinterpret(v[1], OutputType<Double>()), // v0x1y1z1
4181 reinterpret(v[2], OutputType<Double>()), // w1v1x2y2
4182 reinterpret(v[3], OutputType<Double>()), // z2w2v2x3
4183 reinterpret(v[4], OutputType<Double>()), // y3z3w3v3
4184 };
4185 Vec<Double, 32> vs[5];
4186 swizzle_32_16<5>(vDouble, vs);
4187 // vs[0] = x0y0x2y2
4188 // vs[1] = z0w0z2w2
4189 // vs[2] = v0x1v2x3
4190 // vs[3] = y1z1y3z3
4191 // vs[4] = w1v1w3v3
4192 const Vec<Double, 32> vOut[5] = {
4193 // x0x1x2x3
4194 _mm256_shuffle_pd(vs[0], vs[2], 0xa), // 0b1010
4195 // y0y1y2y3
4196 _mm256_shuffle_pd(vs[0], vs[3], 0x5), // 0b0101
4197 // z0z1z2z3
4198 _mm256_shuffle_pd(vs[1], vs[3], 0xa), // 0b1010
4199 // w0w1w2w3
4200 _mm256_shuffle_pd(vs[1], vs[4], 0x5), // 0b0101
4201 // v0v1v2v3
4202 _mm256_shuffle_pd(vs[2], vs[4], 0xa), // 0b1010
4203 };
4204 for (size_t i = 0; i < 5; i++) {
4205 v[i] = reinterpret(vOut[i], OutputType<T>());
4206 }
4207}
4208
4209// ---------------------------------------------------------------------------
4210// comparison functions
4211// ---------------------------------------------------------------------------
4212
4213// 28. Mar 23 (Jonas Keller): checked the constants for _mm256_cmp_ps in the
4214// Float comparison functions, they match the implementation of the SSE versions
4215// (see cmpps in Intel manual) and added corresponding comments
4216
4217// ---------------------------------------------------------------------------
4218// compare <
4219// ---------------------------------------------------------------------------
4220
4221// http://stackoverflow.com/questions/16204663/
4222// sse-compare-packed-unsigned-bytes
4223
4224#ifdef __AVX2__
4225
4226static SIMD_INLINE Vec<Byte, 32> cmplt(const Vec<Byte, 32> &a,
4227 const Vec<Byte, 32> &b)
4228{
4229 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4230 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x80
4231 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x80
4232 return _mm256_cmpgt_epi8(b1, a1);
4233}
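// [documentation sketch, not part of the library] AVX2 only provides signed
// compares, so the unsigned Byte/Word versions flip the sign bit of both
// operands first; this maps unsigned order onto signed order (see the
// stackoverflow link above). Scalar sketch of the trick (hypothetical helper):
//
// static inline bool cmplt_u8_sketch(uint8_t a, uint8_t b)
// {
//   // xor with 0x80 maps [0, 255] onto [-128, 127] while preserving order
//   return int8_t(a ^ 0x80) < int8_t(b ^ 0x80);
// }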
4234
4235static SIMD_INLINE Vec<SignedByte, 32> cmplt(const Vec<SignedByte, 32> &a,
4236 const Vec<SignedByte, 32> &b)
4237{
4238 return _mm256_cmpgt_epi8(b, a);
4239}
4240
4241static SIMD_INLINE Vec<Word, 32> cmplt(const Vec<Word, 32> &a,
4242 const Vec<Word, 32> &b)
4243{
4244 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4245 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x8000
4246 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x8000
4247 return _mm256_cmpgt_epi16(b1, a1);
4248}
4249
4250static SIMD_INLINE Vec<Short, 32> cmplt(const Vec<Short, 32> &a,
4251 const Vec<Short, 32> &b)
4252{
4253 return _mm256_cmpgt_epi16(b, a);
4254}
4255
4256static SIMD_INLINE Vec<Int, 32> cmplt(const Vec<Int, 32> &a,
4257 const Vec<Int, 32> &b)
4258{
4259 return _mm256_cmpgt_epi32(b, a);
4260}
4261
4262static SIMD_INLINE Vec<Long, 32> cmplt(const Vec<Long, 32> &a,
4263 const Vec<Long, 32> &b)
4264{
4265 return _mm256_cmpgt_epi64(b, a);
4266}
4267
4268#else
4269
4270// non-avx2 workaround
4271template <typename T>
4272static SIMD_INLINE Vec<T, 32> cmplt(const Vec<T, 32> &a, const Vec<T, 32> &b)
4273{
4274 return Vec<T, 32>(cmplt(a.lo(), b.lo()), cmplt(a.hi(), b.hi()));
4275}
4276
4277#endif
4278
4279static SIMD_INLINE Vec<Float, 32> cmplt(const Vec<Float, 32> &a,
4280 const Vec<Float, 32> &b)
4281{
4282 // same constant as in implementation of _mm_cmplt_ps (see cmpps instruction
4283 // in Intel manual)
4284 return _mm256_cmp_ps(a, b, _CMP_LT_OS);
4285}
4286
4287static SIMD_INLINE Vec<Double, 32> cmplt(const Vec<Double, 32> &a,
4288 const Vec<Double, 32> &b)
4289{
4290 return _mm256_cmp_pd(a, b, _CMP_LT_OS);
4291}
4292
4293// ---------------------------------------------------------------------------
4294// compare <=
4295// ---------------------------------------------------------------------------
4296
4297// http://stackoverflow.com/questions/16204663/
4298// sse-compare-packed-unsigned-bytes
4299
4300#ifdef __AVX2__
4301
4302static SIMD_INLINE Vec<Byte, 32> cmple(const Vec<Byte, 32> &a,
4303 const Vec<Byte, 32> &b)
4304{
4305 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4306 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x80
4307 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x80
4308 return _mm256_or_si256(_mm256_cmpgt_epi8(b1, a1), _mm256_cmpeq_epi8(a1, b1));
4309}
4310
4311static SIMD_INLINE Vec<SignedByte, 32> cmple(const Vec<SignedByte, 32> &a,
4312 const Vec<SignedByte, 32> &b)
4313{
4314 return _mm256_or_si256(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(a, b));
4315}
4316
4317static SIMD_INLINE Vec<Word, 32> cmple(const Vec<Word, 32> &a,
4318 const Vec<Word, 32> &b)
4319{
4320 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4321 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x8000
4322 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x8000
4323 return _mm256_or_si256(_mm256_cmpgt_epi16(b1, a1),
4324 _mm256_cmpeq_epi16(a1, b1));
4325}
4326
4327static SIMD_INLINE Vec<Short, 32> cmple(const Vec<Short, 32> &a,
4328 const Vec<Short, 32> &b)
4329{
4330 return _mm256_or_si256(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(a, b));
4331}
4332
4333static SIMD_INLINE Vec<Int, 32> cmple(const Vec<Int, 32> &a,
4334 const Vec<Int, 32> &b)
4335{
4336 return _mm256_or_si256(_mm256_cmpgt_epi32(b, a), _mm256_cmpeq_epi32(a, b));
4337}
4338
4339static SIMD_INLINE Vec<Long, 32> cmple(const Vec<Long, 32> &a,
4340 const Vec<Long, 32> &b)
4341{
4342 return _mm256_or_si256(_mm256_cmpgt_epi64(b, a), _mm256_cmpeq_epi64(a, b));
4343}
4344
4345#else
4346
4347// non-avx2 workaround
4348template <typename T>
4349static SIMD_INLINE Vec<T, 32> cmple(const Vec<T, 32> &a, const Vec<T, 32> &b)
4350{
4351 return Vec<T, 32>(cmple(a.lo(), b.lo()), cmple(a.hi(), b.hi()));
4352}
4353
4354#endif
4355
4356static SIMD_INLINE Vec<Float, 32> cmple(const Vec<Float, 32> &a,
4357 const Vec<Float, 32> &b)
4358{
4359 // same constant as in implementation of _mm_cmple_ps (see cmpps instruction
4360 // in Intel manual)
4361 return _mm256_cmp_ps(a, b, _CMP_LE_OS);
4362}
4363
4364static SIMD_INLINE Vec<Double, 32> cmple(const Vec<Double, 32> &a,
4365 const Vec<Double, 32> &b)
4366{
4367 return _mm256_cmp_pd(a, b, _CMP_LE_OS);
4368}
4369
4370// ---------------------------------------------------------------------------
4371// compare ==
4372// ---------------------------------------------------------------------------
4373
4374#ifdef __AVX2__
4375
4376static SIMD_INLINE Vec<Byte, 32> cmpeq(const Vec<Byte, 32> &a,
4377 const Vec<Byte, 32> &b)
4378{
4379 return _mm256_cmpeq_epi8(a, b);
4380}
4381
4382static SIMD_INLINE Vec<SignedByte, 32> cmpeq(const Vec<SignedByte, 32> &a,
4383 const Vec<SignedByte, 32> &b)
4384{
4385 return _mm256_cmpeq_epi8(a, b);
4386}
4387
4388static SIMD_INLINE Vec<Word, 32> cmpeq(const Vec<Word, 32> &a,
4389 const Vec<Word, 32> &b)
4390{
4391 return _mm256_cmpeq_epi16(a, b);
4392}
4393
4394static SIMD_INLINE Vec<Short, 32> cmpeq(const Vec<Short, 32> &a,
4395 const Vec<Short, 32> &b)
4396{
4397 return _mm256_cmpeq_epi16(a, b);
4398}
4399
4400static SIMD_INLINE Vec<Int, 32> cmpeq(const Vec<Int, 32> &a,
4401 const Vec<Int, 32> &b)
4402{
4403 return _mm256_cmpeq_epi32(a, b);
4404}
4405
4406static SIMD_INLINE Vec<Long, 32> cmpeq(const Vec<Long, 32> &a,
4407 const Vec<Long, 32> &b)
4408{
4409 return _mm256_cmpeq_epi64(a, b);
4410}
4411
4412#else
4413
4414// non-avx2 workaround
4415template <typename T>
4416static SIMD_INLINE Vec<T, 32> cmpeq(const Vec<T, 32> &a, const Vec<T, 32> &b)
4417{
4418 return Vec<T, 32>(cmpeq(a.lo(), b.lo()), cmpeq(a.hi(), b.hi()));
4419}
4420
4421#endif
4422
4423static SIMD_INLINE Vec<Float, 32> cmpeq(const Vec<Float, 32> &a,
4424 const Vec<Float, 32> &b)
4425{
4426 // same constant as in implementation of _mm_cmpeq_ps (see cmpps instruction
4427 // in Intel manual)
4428 return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
4429}
4430
4431static SIMD_INLINE Vec<Double, 32> cmpeq(const Vec<Double, 32> &a,
4432 const Vec<Double, 32> &b)
4433{
4434 return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
4435}
4436
4437// ---------------------------------------------------------------------------
4438// compare >
4439// ---------------------------------------------------------------------------
4440
4441// http://stackoverflow.com/questions/16204663/
4442// sse-compare-packed-unsigned-bytes
4443
4444#ifdef __AVX2__
4445
4446static SIMD_INLINE Vec<Byte, 32> cmpgt(const Vec<Byte, 32> &a,
4447 const Vec<Byte, 32> &b)
4448{
4449 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4450 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x80
4451 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x80
4452 return _mm256_cmpgt_epi8(a1, b1);
4453}
4454
4455static SIMD_INLINE Vec<SignedByte, 32> cmpgt(const Vec<SignedByte, 32> &a,
4456 const Vec<SignedByte, 32> &b)
4457{
4458 return _mm256_cmpgt_epi8(a, b);
4459}
4460
4461static SIMD_INLINE Vec<Word, 32> cmpgt(const Vec<Word, 32> &a,
4462 const Vec<Word, 32> &b)
4463{
4464 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4465 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x8000
4466 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x8000
4467 return _mm256_cmpgt_epi16(a1, b1);
4468}
4469
4470static SIMD_INLINE Vec<Short, 32> cmpgt(const Vec<Short, 32> &a,
4471 const Vec<Short, 32> &b)
4472{
4473 return _mm256_cmpgt_epi16(a, b);
4474}
4475
4476static SIMD_INLINE Vec<Int, 32> cmpgt(const Vec<Int, 32> &a,
4477 const Vec<Int, 32> &b)
4478{
4479 return _mm256_cmpgt_epi32(a, b);
4480}
4481
4482static SIMD_INLINE Vec<Long, 32> cmpgt(const Vec<Long, 32> &a,
4483 const Vec<Long, 32> &b)
4484{
4485 return _mm256_cmpgt_epi64(a, b);
4486}
4487
4488#else
4489
4490// non-avx2 workaround
4491template <typename T>
4492static SIMD_INLINE Vec<T, 32> cmpgt(const Vec<T, 32> &a, const Vec<T, 32> &b)
4493{
4494 return Vec<T, 32>(cmpgt(a.lo(), b.lo()), cmpgt(a.hi(), b.hi()));
4495}
4496
4497#endif
4498
4499static SIMD_INLINE Vec<Float, 32> cmpgt(const Vec<Float, 32> &a,
4500 const Vec<Float, 32> &b)
4501{
4502 // same constant as in implementation of _mm_cmplt_ps (see cmpps instruction
4503 // in Intel manual)
4504 return _mm256_cmp_ps(b, a, _CMP_LT_OS);
4505}
4506
4507static SIMD_INLINE Vec<Double, 32> cmpgt(const Vec<Double, 32> &a,
4508 const Vec<Double, 32> &b)
4509{
4510 return _mm256_cmp_pd(b, a, _CMP_LT_OS);
4511}
4512
4513// ---------------------------------------------------------------------------
4514// compare >=
4515// ---------------------------------------------------------------------------
4516
4517// http://stackoverflow.com/questions/16204663/
4518// sse-compare-packed-unsigned-bytes
4519
4520#ifdef __AVX2__
4521
4522static SIMD_INLINE Vec<Byte, 32> cmpge(const Vec<Byte, 32> &a,
4523 const Vec<Byte, 32> &b)
4524{
4525 const __m256i signbit = _mm256_set1_epi32(0x80808080);
4526 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x80
4527 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x80
4528 return _mm256_or_si256(_mm256_cmpgt_epi8(a1, b1), _mm256_cmpeq_epi8(a1, b1));
4529}
4530
4531static SIMD_INLINE Vec<SignedByte, 32> cmpge(const Vec<SignedByte, 32> &a,
4532 const Vec<SignedByte, 32> &b)
4533{
4534 return _mm256_or_si256(_mm256_cmpgt_epi8(a, b), _mm256_cmpeq_epi8(a, b));
4535}
4536
4537static SIMD_INLINE Vec<Word, 32> cmpge(const Vec<Word, 32> &a,
4538 const Vec<Word, 32> &b)
4539{
4540 const __m256i signbit = _mm256_set1_epi32(0x80008000);
4541 const __m256i a1 = _mm256_xor_si256(a, signbit); // sub 0x8000
4542 const __m256i b1 = _mm256_xor_si256(b, signbit); // sub 0x8000
4543 return _mm256_or_si256(_mm256_cmpgt_epi16(a1, b1),
4544 _mm256_cmpeq_epi16(a1, b1));
4545}
4546
4547static SIMD_INLINE Vec<Short, 32> cmpge(const Vec<Short, 32> &a,
4548 const Vec<Short, 32> &b)
4549{
4550 return _mm256_or_si256(_mm256_cmpgt_epi16(a, b), _mm256_cmpeq_epi16(a, b));
4551}
4552
4553static SIMD_INLINE Vec<Int, 32> cmpge(const Vec<Int, 32> &a,
4554 const Vec<Int, 32> &b)
4555{
4556 return _mm256_or_si256(_mm256_cmpgt_epi32(a, b), _mm256_cmpeq_epi32(a, b));
4557}
4558
4559static SIMD_INLINE Vec<Long, 32> cmpge(const Vec<Long, 32> &a,
4560 const Vec<Long, 32> &b)
4561{
4562 return _mm256_or_si256(_mm256_cmpgt_epi64(a, b), _mm256_cmpeq_epi64(a, b));
4563}
4564
4565#else
4566
4567// non-avx2 workaround
4568template <typename T>
4569static SIMD_INLINE Vec<T, 32> cmpge(const Vec<T, 32> &a, const Vec<T, 32> &b)
4570{
4571 return Vec<T, 32>(cmpge(a.lo(), b.lo()), cmpge(a.hi(), b.hi()));
4572}
4573
4574#endif
4575
4576static SIMD_INLINE Vec<Float, 32> cmpge(const Vec<Float, 32> &a,
4577 const Vec<Float, 32> &b)
4578{
4579 // same constant as in implementation of _mm_cmple_ps (see cmpps instruction
4580 // in Intel manual)
4581 return _mm256_cmp_ps(b, a, _CMP_LE_OS);
4582}
4583
4584static SIMD_INLINE Vec<Double, 32> cmpge(const Vec<Double, 32> &a,
4585 const Vec<Double, 32> &b)
4586{
4587 return _mm256_cmp_pd(b, a, _CMP_LE_OS);
4588}
4589
4590// ---------------------------------------------------------------------------
4591// compare !=
4592// ---------------------------------------------------------------------------
4593
4594#ifdef __AVX2__
4595
4596// there is no cmpneq for integers and no not, so use cmpeq and xor with all
4597// ones to invert the result
4598
4599static SIMD_INLINE Vec<Byte, 32> cmpneq(const Vec<Byte, 32> &a,
4600 const Vec<Byte, 32> &b)
4601{
4602 return _mm256_xor_si256(_mm256_cmpeq_epi8(a, b), _mm256_set1_epi32(-1));
4603}
4604
4605static SIMD_INLINE Vec<SignedByte, 32> cmpneq(const Vec<SignedByte, 32> &a,
4606 const Vec<SignedByte, 32> &b)
4607{
4608 return _mm256_xor_si256(_mm256_cmpeq_epi8(a, b), _mm256_set1_epi32(-1));
4609}
4610
4611static SIMD_INLINE Vec<Word, 32> cmpneq(const Vec<Word, 32> &a,
4612 const Vec<Word, 32> &b)
4613{
4614 return _mm256_xor_si256(_mm256_cmpeq_epi16(a, b), _mm256_set1_epi32(-1));
4615}
4616
4617static SIMD_INLINE Vec<Short, 32> cmpneq(const Vec<Short, 32> &a,
4618 const Vec<Short, 32> &b)
4619{
4620 return _mm256_xor_si256(_mm256_cmpeq_epi16(a, b), _mm256_set1_epi32(-1));
4621}
4622
4623static SIMD_INLINE Vec<Int, 32> cmpneq(const Vec<Int, 32> &a,
4624 const Vec<Int, 32> &b)
4625{
4626 return _mm256_xor_si256(_mm256_cmpeq_epi32(a, b), _mm256_set1_epi32(-1));
4627}
4628
4629static SIMD_INLINE Vec<Long, 32> cmpneq(const Vec<Long, 32> &a,
4630 const Vec<Long, 32> &b)
4631{
4632 return _mm256_xor_si256(_mm256_cmpeq_epi64(a, b), _mm256_set1_epi32(-1));
4633}
4634
4635#else
4636
4637// non-avx2 workaround
4638template <typename T>
4639static SIMD_INLINE Vec<T, 32> cmpneq(const Vec<T, 32> &a, const Vec<T, 32> &b)
4640{
4641 return Vec<T, 32>(cmpneq(a.lo(), b.lo()), cmpneq(a.hi(), b.hi()));
4642}
4643
4644#endif
4645
4646static SIMD_INLINE Vec<Float, 32> cmpneq(const Vec<Float, 32> &a,
4647 const Vec<Float, 32> &b)
4648{
4649 // same constant as in implementation of _mm_cmpneq_ps (see cmpps instruction
4650 // in Intel manual)
4651 return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ);
4652}
4653
4654static SIMD_INLINE Vec<Double, 32> cmpneq(const Vec<Double, 32> &a,
4655 const Vec<Double, 32> &b)
4656{
4657 return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);
4658}
4659
4660// ---------------------------------------------------------------------------
4661// ifelse
4662// ---------------------------------------------------------------------------
4663
4664// 10. Apr 23 (Jonas Keller): made two versions of ifelse, one for 8 and 16 bit
4665 // data types, and one for 32 bit and larger data types, so that for the latter
4666// the blendv instruction can be used even if avx2 is not available
4667
4668// version for 8 and 16 bit data types
4669template <typename T, SIMD_ENABLE_IF(sizeof(T) <= 2)>
4670static SIMD_INLINE Vec<T, 32> ifelse(const Vec<T, 32> &cond,
4671 const Vec<T, 32> &trueVal,
4672 const Vec<T, 32> &falseVal)
4673{
4674#ifdef __AVX2__
4675 const Vec<Byte, 32> res =
4676 _mm256_blendv_epi8(reinterpret(falseVal, OutputType<Byte>()),
4677 reinterpret(trueVal, OutputType<Byte>()),
4678 reinterpret(cond, OutputType<Byte>()));
4679#else
4680 // non-avx2 workaround
4681 const Vec<Float, 32> res =
4682 _mm256_or_ps(_mm256_and_ps(reinterpret(cond, OutputType<Float>()),
4683 reinterpret(trueVal, OutputType<Float>())),
4684 _mm256_andnot_ps(reinterpret(cond, OutputType<Float>()),
4685 reinterpret(falseVal, OutputType<Float>())));
4686#endif
4687 return reinterpret(res, OutputType<T>());
4688}
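// [documentation sketch, not part of the library] The non-AVX2 branch above
// builds the element selection from AND/OR/ANDNOT, which is valid because the
// comparison functions return masks that are all ones or all zeros per
// element. Scalar sketch of the same bitwise select (hypothetical helper):
//
// static inline uint32_t select_bits_sketch(uint32_t cond, uint32_t t, uint32_t f)
// {
//   return (cond & t) | (~cond & f); // cond all ones -> t, cond all zeros -> f
// }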
4689
4690// version for 32 bit or larger data types
4691template <typename T, SIMD_ENABLE_IF(sizeof(T) > 2), typename = void>
4692static SIMD_INLINE Vec<T, 32> ifelse(const Vec<T, 32> &cond,
4693 const Vec<T, 32> &trueVal,
4694 const Vec<T, 32> &falseVal)
4695{
4696 const Vec<Float, 32> res =
4697 _mm256_blendv_ps(reinterpret(falseVal, OutputType<Float>()),
4698 reinterpret(trueVal, OutputType<Float>()),
4699 reinterpret(cond, OutputType<Float>()));
4700 return reinterpret(res, OutputType<T>());
4701}
4702
4703// ---------------------------------------------------------------------------
4704// bit_and
4705// ---------------------------------------------------------------------------
4706
4707// all integer versions
4708template <typename T>
4709static SIMD_INLINE Vec<T, 32> bit_and(const Vec<T, 32> &a, const Vec<T, 32> &b)
4710{
4711#ifdef __AVX2__
4712 return _mm256_and_si256(a, b);
4713#else
4714 // non-avx2 workaround
4715 return _mm256_castps_si256(
4716 _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4717#endif
4718}
4719
4720// float version
4721static SIMD_INLINE Vec<Float, 32> bit_and(const Vec<Float, 32> &a,
4722 const Vec<Float, 32> &b)
4723{
4724 return _mm256_and_ps(a, b);
4725}
4726
4727// double version
4728static SIMD_INLINE Vec<Double, 32> bit_and(const Vec<Double, 32> &a,
4729 const Vec<Double, 32> &b)
4730{
4731 return _mm256_and_pd(a, b);
4732}
4733
4734// ---------------------------------------------------------------------------
4735// bit_or
4736// ---------------------------------------------------------------------------
4737
4738// all integer versions
4739template <typename T>
4740static SIMD_INLINE Vec<T, 32> bit_or(const Vec<T, 32> &a, const Vec<T, 32> &b)
4741{
4742#ifdef __AVX2__
4743 return _mm256_or_si256(a, b);
4744#else
4745 // non-avx2 workaround
4746 return _mm256_castps_si256(
4747 _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4748#endif
4749}
4750
4751// float version
4752static SIMD_INLINE Vec<Float, 32> bit_or(const Vec<Float, 32> &a,
4753 const Vec<Float, 32> &b)
4754{
4755 return _mm256_or_ps(a, b);
4756}
4757
4758// double version
4759static SIMD_INLINE Vec<Double, 32> bit_or(const Vec<Double, 32> &a,
4760 const Vec<Double, 32> &b)
4761{
4762 return _mm256_or_pd(a, b);
4763}
4764
4765// ---------------------------------------------------------------------------
4766// bit_andnot
4767// ---------------------------------------------------------------------------
4768
4769// all integer versions
4770template <typename T>
4771static SIMD_INLINE Vec<T, 32> bit_andnot(const Vec<T, 32> &a,
4772 const Vec<T, 32> &b)
4773{
4774#ifdef __AVX2__
4775 return _mm256_andnot_si256(a, b);
4776#else
4777 // non-avx2 workaround
4778 return _mm256_castps_si256(
4779 _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4780#endif
4781}
4782
4783// float version
4784static SIMD_INLINE Vec<Float, 32> bit_andnot(const Vec<Float, 32> &a,
4785 const Vec<Float, 32> &b)
4786{
4787 return _mm256_andnot_ps(a, b);
4788}
4789
4790// double version
4791static SIMD_INLINE Vec<Double, 32> bit_andnot(const Vec<Double, 32> &a,
4792 const Vec<Double, 32> &b)
4793{
4794 return _mm256_andnot_pd(a, b);
4795}
4796
4797// ---------------------------------------------------------------------------
4798// bit_xor
4799// ---------------------------------------------------------------------------
4800
4801// all integer versions
4802template <typename T>
4803static SIMD_INLINE Vec<T, 32> bit_xor(const Vec<T, 32> &a, const Vec<T, 32> &b)
4804{
4805#ifdef __AVX2__
4806 return _mm256_xor_si256(a, b);
4807#else
4808 // non-avx2 workaround
4809 return _mm256_castps_si256(
4810 _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
4811#endif
4812}
4813
4814// float version
4815static SIMD_INLINE Vec<Float, 32> bit_xor(const Vec<Float, 32> &a,
4816 const Vec<Float, 32> &b)
4817{
4818 return _mm256_xor_ps(a, b);
4819}
4820
4821// double version
4822static SIMD_INLINE Vec<Double, 32> bit_xor(const Vec<Double, 32> &a,
4823 const Vec<Double, 32> &b)
4824{
4825 return _mm256_xor_pd(a, b);
4826}
4827
4828// ---------------------------------------------------------------------------
4829// bit_not
4830// ---------------------------------------------------------------------------
4831
4832// all integer versions
4833template <typename T>
4834static SIMD_INLINE Vec<T, 32> bit_not(const Vec<T, 32> &a)
4835{
4836#ifdef __AVX2__
4837 return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
4838#else
4839 // non-avx2 workaround
4840 return _mm256_castps_si256(_mm256_xor_ps(
4841 _mm256_castsi256_ps(a), _mm256_castsi256_ps(_mm256_set1_epi32(-1))));
4842#endif
4843}
4844
4845// float version
4846static SIMD_INLINE Vec<Float, 32> bit_not(const Vec<Float, 32> &a)
4847{
4848 return _mm256_xor_ps(a, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
4849}
4850
4851// double version
4852static SIMD_INLINE Vec<Double, 32> bit_not(const Vec<Double, 32> &a)
4853{
4854 return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
4855}
4856
4857// ---------------------------------------------------------------------------
4858 // avg: average with rounding up (integer versions round like the avg instructions)
4859// ---------------------------------------------------------------------------
4860
4861#ifdef __AVX2__
4862
4863static SIMD_INLINE Vec<Byte, 32> avg(const Vec<Byte, 32> &a,
4864 const Vec<Byte, 32> &b)
4865{
4866 return _mm256_avg_epu8(a, b);
4867}
4868
4869// Paul R at
4870// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4871static SIMD_INLINE Vec<SignedByte, 32> avg(const Vec<SignedByte, 32> &a,
4872 const Vec<SignedByte, 32> &b)
4873{
4874 // from Agner Fog's VCL vectori128.h
4875 const __m256i signbit = _mm256_set1_epi8(int8_t(0x80));
4876 const __m256i a1 = _mm256_xor_si256(a, signbit); // add 0x80
4877 const __m256i b1 = _mm256_xor_si256(b, signbit); // add 0x80
4878 const __m256i m1 = _mm256_avg_epu8(a1, b1); // unsigned avg
4879 return _mm256_xor_si256(m1, signbit); // sub 0x80
4880}
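// [documentation sketch, not part of the library] The signed average above
// reuses the unsigned rounding-average instruction by biasing both inputs with
// the sign bit and un-biasing the result (Agner Fog's VCL trick cited in the
// comments). Scalar sketch (hypothetical helper):
//
// static inline int8_t avg_s8_sketch(int8_t a, int8_t b)
// {
//   const unsigned ua = uint8_t(a) ^ 0x80u;   // add 0x80
//   const unsigned ub = uint8_t(b) ^ 0x80u;   // add 0x80
//   const unsigned m = (ua + ub + 1u) >> 1;   // unsigned average, rounds up
//   return int8_t(uint8_t(m) ^ 0x80u);        // sub 0x80
// }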
4881
4882static SIMD_INLINE Vec<Word, 32> avg(const Vec<Word, 32> &a,
4883 const Vec<Word, 32> &b)
4884{
4885 return _mm256_avg_epu16(a, b);
4886}
4887
4888// Paul R at
4889// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4890static SIMD_INLINE Vec<Short, 32> avg(const Vec<Short, 32> &a,
4891 const Vec<Short, 32> &b)
4892{
4893 // from Agner Fog's VCL vectori128.h
4894 const __m256i signbit = _mm256_set1_epi16(int16_t(0x8000));
4895 const __m256i a1 = _mm256_xor_si256(a, signbit); // add 0x8000
4896 const __m256i b1 = _mm256_xor_si256(b, signbit); // add 0x8000
4897 const __m256i m1 = _mm256_avg_epu16(a1, b1); // unsigned avg
4898 return _mm256_xor_si256(m1, signbit); // sub 0x8000
4899}
4900
4901#else
4902
4903// non-avx2 workaround
4904template <typename T>
4905static SIMD_INLINE Vec<T, 32> avg(const Vec<T, 32> &a, const Vec<T, 32> &b)
4906{
4907 return Vec<T, 32>(avg(a.lo(), b.lo()), avg(a.hi(), b.hi()));
4908}
4909
4910#endif
4911
4912// Paul R at
4913// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4914static SIMD_INLINE Vec<Int, 32> avg(const Vec<Int, 32> &a,
4915 const Vec<Int, 32> &b)
4916{
4917 const auto halfA = srai<1>(a);
4918 const auto halfB = srai<1>(b);
4919 const auto sum = add(halfA, halfB);
4920 const auto lsb = bit_and(bit_or(a, b), set1(Int(1), Integer<32>()));
4921 return add(sum, lsb);
4922}
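// [documentation sketch, not part of the library] For 32- and 64-bit elements
// there is no hardware average, so the code above uses
// (a >> 1) + (b >> 1) + ((a | b) & 1), which equals (a + b + 1) >> 1 (the
// rounding used by the avg instructions) but cannot overflow. Scalar sketch
// (hypothetical helper; assumes arithmetic >> on signed values):
//
// static inline int32_t avg_s32_sketch(int32_t a, int32_t b)
// {
//   return (a >> 1) + (b >> 1) + ((a | b) & 1); // overflow-free rounding average
// }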
4923
4924// Paul R at
4925// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4926static SIMD_INLINE Vec<Long, 32> avg(const Vec<Long, 32> &a,
4927 const Vec<Long, 32> &b)
4928{
4929 const auto halfA = srai<1>(a);
4930 const auto halfB = srai<1>(b);
4931 const auto sum = add(halfA, halfB);
4932 const auto lsb = bit_and(bit_or(a, b), set1(Long(1), Integer<32>()));
4933 return add(sum, lsb);
4934}
4935
4936// NOTE: Float version doesn't round!
4937static SIMD_INLINE Vec<Float, 32> avg(const Vec<Float, 32> &a,
4938 const Vec<Float, 32> &b)
4939{
4940 return _mm256_mul_ps(_mm256_add_ps(a, b), _mm256_set1_ps(0.5f));
4941}
4942
4943// NOTE: Double version doesn't round!
4944static SIMD_INLINE Vec<Double, 32> avg(const Vec<Double, 32> &a,
4945 const Vec<Double, 32> &b)
4946{
4947 return _mm256_mul_pd(_mm256_add_pd(a, b), _mm256_set1_pd(0.5));
4948}
4949
4950// ---------------------------------------------------------------------------
4951// test_all_zeros
4952// ---------------------------------------------------------------------------
4953
4954template <typename T>
4955static SIMD_INLINE bool test_all_zeros(const Vec<T, 32> &a)
4956{
4957 const auto intA = reinterpret(a, OutputType<Int>());
4958 return _mm256_testz_si256(intA, intA);
4959}
4960
4961// ---------------------------------------------------------------------------
4962// test_all_ones
4963// ---------------------------------------------------------------------------
4964
4965template <typename T>
4966static SIMD_INLINE bool test_all_ones(const Vec<T, 32> &a)
4967{
4968 const auto intA = reinterpret(a, OutputType<Int>());
4969 return _mm256_testc_si256(intA, _mm256_set1_epi32(-1));
4970}
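// [documentation sketch, not part of the library] _mm256_testc_si256(a, b)
// reports whether (~a & b) == 0; with b = all ones this is exactly "a is all
// ones", while _mm256_testz_si256(a, a) above is "a is all zeros". Scalar
// sketch of the testc identity (hypothetical helper):
//
// static inline bool testc_sketch(uint64_t a, uint64_t b)
// {
//   return (~a & b) == 0; // with b = ~0ull: true iff a == ~0ull
// }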
4971
4972// ---------------------------------------------------------------------------
4973// reverse
4974// ---------------------------------------------------------------------------
4975
4976 // All reverse operations below are courtesy of Yannick Sander
4977 // (with modifications)
4978
4979static SIMD_INLINE Vec<Byte, 32> reverse(const Vec<Byte, 32> &a)
4980{
4981#ifdef __AVX2__
4982 const __m256i mask =
4983 _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
4984 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4985 // _mm256_shuffle_epi8 reverses the upper and lower lanes of a individually;
4986 // the two lanes have to be swapped as well to perform a full reverse
4987 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
4988 // swap lanes
4989 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
4990#else // AVX fallback
4991 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
4992#endif
4993}
4994
4995static SIMD_INLINE Vec<SignedByte, 32> reverse(const Vec<SignedByte, 32> &a)
4996{
4997#ifdef __AVX2__
4998 const __m256i mask =
4999 _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
5000 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
5001 // _mm256_shuffle_epi8 reverses the upper and lower lanes of a individually;
5002 // the two lanes have to be swapped as well to perform a full reverse
5003 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5004 // swap lanes
5005 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5006#else
5007 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5008#endif
5009}
5010
5011static SIMD_INLINE Vec<Short, 32> reverse(const Vec<Short, 32> &a)
5012{
5013#ifdef __AVX2__
5014 const __m256i mask =
5015 _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17,
5016 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
5017 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5018 // swap lanes
5019 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5020#else
5021 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5022#endif
5023}
5024
5025static SIMD_INLINE Vec<Word, 32> reverse(const Vec<Word, 32> &a)
5026{
5027#ifdef __AVX2__
5028 const __m256i mask =
5029 _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17,
5030 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
5031 const __m256i shuffled_lanes = _mm256_shuffle_epi8(a, mask);
5032 // swap lanes
5033 return _mm256_permute4x64_epi64(shuffled_lanes, _MM_SHUFFLE(1, 0, 3, 2));
5034#else
5035 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5036#endif
5037}
5038
5039static SIMD_INLINE Vec<Int, 32> reverse(const Vec<Int, 32> &a)
5040{
5041#ifdef __AVX2__
5042 const __m256i mask = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
5043 return _mm256_permutevar8x32_epi32(a, mask);
5044#else
5045 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5046#endif
5047}
5048
5049static SIMD_INLINE Vec<Long, 32> reverse(const Vec<Long, 32> &a)
5050{
5051#ifdef __AVX2__
5052 const __m256i mask = _mm256_set_epi32(1, 0, 3, 2, 5, 4, 7, 6);
5053 return _mm256_permutevar8x32_epi32(a, mask);
5054#else
5055 return _mm256_set_m128i(reverse(a.lo()), reverse(a.hi()));
5056#endif
5057}
5058
5059static SIMD_INLINE Vec<Float, 32> reverse(const Vec<Float, 32> &a)
5060{
5061#ifdef __AVX2__
5062 const __m256i mask = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
5063 return _mm256_permutevar8x32_ps(a, mask);
5064#else
5065 return _mm256_set_m128(reverse(a.lo()), reverse(a.hi()));
5066#endif
5067}
5068
5069static SIMD_INLINE Vec<Double, 32> reverse(const Vec<Double, 32> &a)
5070{
5071#ifdef __AVX2__
5072 const __m256i mask = _mm256_set_epi32(1, 0, 3, 2, 5, 4, 7, 6);
5073 return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(a), mask));
5074#else
5075 return _mm256_set_m128d(reverse(a.lo()), reverse(a.hi()));
5076#endif
5077}
5078
5079// ---------------------------------------------------------------------------
5080// msb2int
5081// ---------------------------------------------------------------------------
5082
5083// 26. Aug 22 (Jonas Keller): added msb2int functions
5084
5085// 16. Aug 23 (Jonas Keller): fixed bug in msb2int for Byte and SignedByte
5086// caused by trying to cast an int to uint64_t which internally first casts to
5087// int64_t and then to uint64_t, which causes sign extension
5088
5089template <typename T,
5090 SIMD_ENABLE_IF(std::is_integral<T>::value && sizeof(T) == 1)>
5091static SIMD_INLINE uint64_t msb2int(const Vec<T, 32> &a)
5092{
5093 // first convert to uint32_t to avoid sign extension
5094#ifdef __AVX2__
5095 const auto res = _mm256_movemask_epi8(a);
5096#else
5097 const auto res =
5098 _mm_movemask_epi8(a.lo()) | (_mm_movemask_epi8(a.hi()) << 16);
5099#endif
5100 // prevent sign extension when casting to uint64_t by first casting to uint
5101 return uint64_t(uint(res));
5102}
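// Illustrative sketch of the sign-extension pitfall mentioned above (not part
// of the original code): _mm256_movemask_epi8 returns an int, and on the usual
// two's-complement targets a direct conversion to uint64_t sign-extends when
// bit 31 of the mask is set:
//   uint64_t(int(0x80000000u))           -> 0xffffffff80000000
//   uint64_t(uint32_t(int(0x80000000u))) -> 0x0000000080000000
// hence the intermediate cast to an unsigned 32-bit type above.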
5103
5104template <typename T,
5105 SIMD_ENABLE_IF(std::is_integral<T>::value && sizeof(T) == 2),
5106 typename = void>
5107static SIMD_INLINE uint64_t msb2int(const Vec<T, 32> &a)
5108{
5109 // there is no _mm256_movemask_epi16, so use _mm256_movemask_epi8
5110 // and discard the even bits afterwards (see the scalar sketch after this function);
5111 // discarding the even bytes of a with a shuffle beforehand does not work,
5112 // since _mm256_shuffle_epi8 only shuffles within 128-bit lanes
5113 // TODO: better way to do this?
5114#ifdef __AVX2__
5115 uint64_t x = _mm256_movemask_epi8(a);
5116#else
5117 uint64_t x = _mm_movemask_epi8(a.lo()) | (_mm_movemask_epi8(a.hi()) << 16);
5118#endif
5119 // idea from: https://stackoverflow.com/a/45695465/8461272
5120 // x = 0b a.b. c.d. e.f. g.h. i.j. k.l. m.n. o.p.
5121 // where a,b,c,d,... are the bits we care about and . represents
5122 // the bits we don't care about
5123
5124 x >>= 1;
5125 // x = 0b .a.b .c.d .e.f .g.h .i.j .k.l .m.n .o.p
5126
5127 x = ((x & 0x44444444) >> 1) | (x & 0x11111111);
5128 // x = 0b ..ab ..cd ..ef ..gh ..ij ..kl ..mn ..op
5129
5130 x = ((x & 0x30303030) >> 2) | (x & 0x03030303);
5131 // x = 0b .... abcd .... efgh .... ijkl .... mnop
5132
5133 x = ((x & 0x0F000F00) >> 4) | (x & 0x000F000F);
5134 // x = 0b .... .... abcd efgh .... .... ijkl mnop
5135
5136 x = ((x & 0x00FF0000) >> 8) | (x & 0x000000FF);
5137 // x = 0b .... .... .... .... abcd efgh ijkl mnop
5138
5139 return x;
5140}
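// Illustrative scalar reference for the shift/mask cascade above (a sketch,
// not part of the original code; the helper name is hypothetical): the cascade
// keeps bit (2*i + 1) of the 32-bit byte mask as bit i of the result, i.e. the
// movemask bit of the high byte of each 16-bit element.
static SIMD_INLINE uint64_t msb2int16ScalarSketch(uint32_t byteMask)
{
  uint64_t r = 0;
  for (int i = 0; i < 16; i++) {
    // bit 2*i+1 of the byte mask is the MSB of 16-bit element i
    r |= uint64_t((byteMask >> (2 * i + 1)) & 1u) << i;
  }
  return r;
}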
5141
5142static SIMD_INLINE uint64_t msb2int(const Vec<Int, 32> &a)
5143{
5144 return _mm256_movemask_ps(_mm256_castsi256_ps(a));
5145}
5146
5147static SIMD_INLINE uint64_t msb2int(const Vec<Long, 32> &a)
5148{
5149 return _mm256_movemask_pd(_mm256_castsi256_pd(a));
5150}
5151
5152static SIMD_INLINE uint64_t msb2int(const Vec<Float, 32> &a)
5153{
5154 return _mm256_movemask_ps(a);
5155}
5156
5157static SIMD_INLINE uint64_t msb2int(const Vec<Double, 32> &a)
5158{
5159 return _mm256_movemask_pd(a);
5160}
5161
5162// ---------------------------------------------------------------------------
5163// int2msb
5164// ---------------------------------------------------------------------------
5165
5166// 06. Oct 22 (Jonas Keller): added int2msb functions
5167
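// Note: int2msb expands the bit mask a into a vector: element i has only its
// most significant bit set if bit i of a is set and is zero otherwise (roughly
// the inverse of msb2int above). E.g. for Byte and a = 0b101, elements 0 and 2
// become 0x80 and all other elements 0.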
5168static SIMD_INLINE Vec<Byte, 32> int2msb(const uint64_t a, OutputType<Byte>,
5169 Integer<32>)
5170{
5171#ifdef __AVX2__
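  // broadcast the low 32 bits of a, replicate byte k of a into all bytes of
  // 64-bit block k via the shuffle, select bit (i % 8) of each byte with the
  // per-byte selector, compare with the selector to get an all-ones mask per
  // set bit, and finally keep only the most significant bit of each element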
5172 const __m256i shuffleIndeces = _mm256_set_epi64x(
5173 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
5174 const __m256i aVec =
5175 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
5176 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
5177 const __m256i selected = _mm256_and_si256(aVec, sel);
5178 const __m256i result = _mm256_cmpeq_epi8(selected, sel);
5179 return _mm256_and_si256(result, _mm256_set1_epi8((int8_t) 0x80));
5180#else
5181 const __m128i shuffleIndeces = _mm_set_epi64x(0x0101010101010101, 0);
5182 const __m128i aVecLo = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndeces);
5183 const __m128i aVecHi =
5184 _mm_shuffle_epi8(_mm_cvtsi32_si128(a >> 16), shuffleIndeces);
5185 const __m128i sel = _mm_set1_epi64x(0x8040201008040201);
5186 const __m128i selectedLo = _mm_and_si128(aVecLo, sel);
5187 const __m128i selectedHi = _mm_and_si128(aVecHi, sel);
5188 const __m128i resultLo = _mm_cmpeq_epi8(selectedLo, sel);
5189 const __m128i resultHi = _mm_cmpeq_epi8(selectedHi, sel);
5190 const __m256i result = _mm256_set_m128i(resultHi, resultLo);
5191 return _mm256_castps_si256(
5192 _mm256_and_ps(_mm256_castsi256_ps(result),
5193 _mm256_castsi256_ps(_mm256_set1_epi8((int8_t) 0x80))));
5194#endif
5195}
5196
5197static SIMD_INLINE Vec<SignedByte, 32> int2msb(const uint64_t a,
5198 OutputType<SignedByte>,
5199 Integer<32>)
5200{
5201 return reinterpret(int2msb(a, OutputType<Byte>(), Integer<32>()),
5202 OutputType<SignedByte>());
5203}
5204
5205static SIMD_INLINE Vec<Short, 32> int2msb(const uint64_t a, OutputType<Short>,
5206 Integer<32>)
5207{
5208#ifdef __AVX2__
5209 const __m256i aVec = _mm256_set1_epi16(a);
5210 const __m256i sel = _mm256_set_epi16(
5211 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
5212 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
5213 const __m256i selected = _mm256_and_si256(aVec, sel);
5214 const __m256i result = _mm256_cmpeq_epi16(selected, sel);
5215 return _mm256_and_si256(result, _mm256_set1_epi16((int16_t) 0x8000));
5216#else
5217 const __m128i aVec = _mm_set1_epi16(a);
5218 const __m128i selLo = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008,
5219 0x0004, 0x0002, 0x0001);
5220 const __m128i selHi = _mm_set_epi16((int16_t) 0x8000, 0x4000, 0x2000, 0x1000,
5221 0x0800, 0x0400, 0x0200, 0x0100);
5222 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5223 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5224 const __m128i resultLo = _mm_cmpeq_epi16(selectedLo, selLo);
5225 const __m128i resultHi = _mm_cmpeq_epi16(selectedHi, selHi);
5226 const __m256i result = _mm256_set_m128i(resultHi, resultLo);
5227 return _mm256_castps_si256(
5228 _mm256_and_ps(_mm256_castsi256_ps(result),
5229 _mm256_castsi256_ps(_mm256_set1_epi16((int16_t) 0x8000))));
5230#endif
5231}
5232
5233static SIMD_INLINE Vec<Word, 32> int2msb(const uint64_t a, OutputType<Word>,
5234 Integer<32>)
5235{
5236 return reinterpret(int2msb(a, OutputType<Short>(), Integer<32>()),
5237 OutputType<Word>());
5238}
5239
5240static SIMD_INLINE Vec<Int, 32> int2msb(const uint64_t a, OutputType<Int>,
5241 Integer<32>)
5242{
5243#ifdef __AVX2__
5244 const __m256i aVec = _mm256_set1_epi32(a);
5245 const __m256i sel =
5246 _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
5247 const __m256i selected = _mm256_and_si256(aVec, sel);
5248 const __m256i result = _mm256_cmpeq_epi32(selected, sel);
5249 return _mm256_and_si256(result, _mm256_set1_epi32(0x80000000));
5250#else
5251 const __m128i aVec = _mm_set1_epi32(a);
5252 const __m128i selLo = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
5253 const __m128i selHi = _mm_set_epi32(0x80, 0x40, 0x20, 0x10);
5254 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5255 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5256 const __m256i result = _mm256_set_m128i(_mm_cmpeq_epi32(selectedHi, selHi),
5257 _mm_cmpeq_epi32(selectedLo, selLo));
5258 return _mm256_castps_si256(
5259 _mm256_and_ps(_mm256_castsi256_ps(result),
5260 _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))));
5261#endif
5262}
5263
5264static SIMD_INLINE Vec<Long, 32> int2msb(const uint64_t a, OutputType<Long>,
5265 Integer<32>)
5266{
5267#ifdef __AVX2__
5268 const __m256i aVec = _mm256_set1_epi64x(a);
5269 const __m256i sel = _mm256_set_epi64x(8, 4, 2, 1);
5270 const __m256i selected = _mm256_and_si256(aVec, sel);
5271 const __m256i result = _mm256_cmpeq_epi64(selected, sel);
5272 return _mm256_and_si256(result, _mm256_set1_epi64x(0x8000000000000000));
5273#else
5274 const __m128i aVec = _mm_set1_epi64x(a);
5275 const __m128i selLo = _mm_set_epi64x(2, 1);
5276 const __m128i selHi = _mm_set_epi64x(8, 4);
5277 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5278 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5279 const __m256i result = _mm256_set_m128i(_mm_cmpeq_epi64(selectedHi, selHi),
5280 _mm_cmpeq_epi64(selectedLo, selLo));
5281 return _mm256_castpd_si256(
5282 _mm256_and_pd(_mm256_castsi256_pd(result),
5283 _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))));
5284#endif
5285}
5286
5287static SIMD_INLINE Vec<Float, 32> int2msb(const uint64_t a, OutputType<Float>,
5288 Integer<32>)
5289{
5290 return reinterpret(int2msb(a, OutputType<Int>(), Integer<32>()),
5291 OutputType<Float>());
5292}
5293
5294static SIMD_INLINE Vec<Double, 32> int2msb(const uint64_t a, OutputType<Double>,
5295 Integer<32>)
5296{
5297 return reinterpret(int2msb(a, OutputType<Long>(), Integer<32>()),
5298 OutputType<Double>());
5299}
5300
5301// ---------------------------------------------------------------------------
5302// int2bits
5303// ---------------------------------------------------------------------------
5304
5305// 09. Oct 22 (Jonas Keller): added int2bits
5306
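// Note: int2bits expands the bit mask a into a vector of per-element masks:
// element i is all ones if bit i of a is set and all zeros otherwise (like
// int2msb above, but setting every bit of the element instead of only the
// most significant one).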
5307static SIMD_INLINE Vec<Byte, 32> int2bits(const uint64_t a, OutputType<Byte>,
5308 Integer<32>)
5309{
5310#ifdef __AVX2__
5311 const __m256i shuffleIndeces = _mm256_set_epi64x(
5312 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0);
5313 const __m256i aVec =
5314 _mm256_shuffle_epi8(_mm256_set1_epi32(a), shuffleIndeces);
5315 const __m256i sel = _mm256_set1_epi64x(0x8040201008040201);
5316 const __m256i selected = _mm256_and_si256(aVec, sel);
5317 return _mm256_cmpeq_epi8(selected, sel);
5318#else
5319 return _mm256_set_m128i(int2bits(a >> 16, OutputType<Byte>(), Integer<16>()),
5320 int2bits(a, OutputType<Byte>(), Integer<16>()));
5321#endif
5322}
5323
5324static SIMD_INLINE Vec<SignedByte, 32> int2bits(const uint64_t a,
5325 OutputType<SignedByte>,
5326 Integer<32>)
5327{
5328 return reinterpret(int2bits(a, OutputType<Byte>(), Integer<32>()),
5329 OutputType<SignedByte>());
5330}
5331
5332static SIMD_INLINE Vec<Short, 32> int2bits(const uint64_t a, OutputType<Short>,
5333 Integer<32>)
5334{
5335#ifdef __AVX2__
5336 const __m256i aVec = _mm256_set1_epi16(a);
5337 const __m256i sel = _mm256_set_epi16(
5338 (int16_t) 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
5339 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
5340 const __m256i selected = _mm256_and_si256(aVec, sel);
5341 return _mm256_cmpeq_epi16(selected, sel);
5342#else
5343 return _mm256_set_m128i(int2bits(a >> 8, OutputType<Short>(), Integer<16>()),
5344 int2bits(a, OutputType<Short>(), Integer<16>()));
5345#endif
5346}
5347
5348static SIMD_INLINE Vec<Word, 32> int2bits(const uint64_t a, OutputType<Word>,
5349 Integer<32>)
5350{
5351 return reinterpret(int2bits(a, OutputType<Short>(), Integer<32>()),
5352 OutputType<Word>());
5353}
5354
5355static SIMD_INLINE Vec<Int, 32> int2bits(const uint64_t a, OutputType<Int>,
5356 Integer<32>)
5357{
5358#ifdef __AVX2__
5359 const __m256i aVec = _mm256_set1_epi32(a);
5360 const __m256i sel =
5361 _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
5362 const __m256i selected = _mm256_and_si256(aVec, sel);
5363 return _mm256_cmpeq_epi32(selected, sel);
5364#else
5365 return _mm256_set_m128i(int2bits(a >> 4, OutputType<Int>(), Integer<16>()),
5366 int2bits(a, OutputType<Int>(), Integer<16>()));
5367#endif
5368}
5369
5370static SIMD_INLINE Vec<Long, 32> int2bits(const uint64_t a, OutputType<Long>,
5371 Integer<32>)
5372{
5373#ifdef __AVX2__
5374 const __m256i aVec = _mm256_set1_epi64x(a);
5375 const __m256i sel = _mm256_set_epi64x(8, 4, 2, 1);
5376 const __m256i selected = _mm256_and_si256(aVec, sel);
5377 return _mm256_cmpeq_epi64(selected, sel);
5378#else
5379 const __m128i aVec = _mm_set1_epi64x(a);
5380 const __m128i selLo = _mm_set_epi64x(2, 1);
5381 const __m128i selHi = _mm_set_epi64x(8, 4);
5382 const __m128i selectedLo = _mm_and_si128(aVec, selLo);
5383 const __m128i selectedHi = _mm_and_si128(aVec, selHi);
5384 return _mm256_set_m128i(_mm_cmpeq_epi64(selectedHi, selHi),
5385 _mm_cmpeq_epi64(selectedLo, selLo));
5386#endif
5387}
5388
5389static SIMD_INLINE Vec<Float, 32> int2bits(const uint64_t a, OutputType<Float>,
5390 Integer<32>)
5391{
5392 return reinterpret(int2bits(a, OutputType<Int>(), Integer<32>()),
5393 OutputType<Float>());
5394}
5395
5396static SIMD_INLINE Vec<Double, 32> int2bits(const uint64_t a,
5397 OutputType<Double>, Integer<32>)
5398{
5399 return reinterpret(int2bits(a, OutputType<Long>(), Integer<32>()),
5400 OutputType<Double>());
5401}
5402
5403// ---------------------------------------------------------------------------
5404// iota
5405// ---------------------------------------------------------------------------
5406
5407// 30. Jan 23 (Jonas Keller): added iota
5408
5409static SIMD_INLINE Vec<Byte, 32> iota(OutputType<Byte>, Integer<32>)
5410{
5411 return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
5412 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
5413 1, 0);
5414}
5415
5416static SIMD_INLINE Vec<SignedByte, 32> iota(OutputType<SignedByte>, Integer<32>)
5417{
5418 return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
5419 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
5420 1, 0);
5421}
5422
5423static SIMD_INLINE Vec<Short, 32> iota(OutputType<Short>, Integer<32>)
5424{
5425 return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5426}
5427
5428static SIMD_INLINE Vec<Word, 32> iota(OutputType<Word>, Integer<32>)
5429{
5430 return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5431}
5432
5433static SIMD_INLINE Vec<Int, 32> iota(OutputType<Int>, Integer<32>)
5434{
5435 return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
5436}
5437
5438static SIMD_INLINE Vec<Long, 32> iota(OutputType<Long>, Integer<32>)
5439{
5440 return _mm256_set_epi64x(3, 2, 1, 0);
5441}
5442
5443static SIMD_INLINE Vec<Float, 32> iota(OutputType<Float>, Integer<32>)
5444{
5445 return _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
5446}
5447
5448static SIMD_INLINE Vec<Double, 32> iota(OutputType<Double>, Integer<32>)
5449{
5450 return _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
5451}
5452
5453} // namespace base
5454} // namespace internal
5455} // namespace simd
5456
5457#endif
5458
5459#endif // SIMD_VEC_BASE_IMPL_INTEL_32_H_