T-SIMD v31.1.0
A C++ template SIMD library
base_impl_intel16.H
1// ===========================================================================
2//
3// encapsulation for SSE Intel vector extensions
4// inspired by Agner Fog's C++ Vector Class Library
5// http://www.agner.org/optimize/#vectorclass
6// (VCL License: GNU General Public License Version 3,
7// http://www.gnu.org/licenses/gpl-3.0.en.html)
8//
9// This source code file is part of the following software:
10//
11// - the low-level C++ template SIMD library
12// - the SIMD implementation of the MinWarping and the 2D-Warping methods
13// for local visual homing.
14//
15// The software is provided based on the accompanying license agreement in the
16// file LICENSE.md.
17// The software is provided "as is" without any warranty by the licensor and
18// without any liability of the licensor, and the software may not be
19// distributed by the licensee; see the license agreement for details.
20//
21// (C) Ralf Möller
22// Computer Engineering
23// Faculty of Technology
24// Bielefeld University
25// www.ti.uni-bielefeld.de
26//
27// ===========================================================================
28
29// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
30// namespace
31// 13. May 23 (Jonas Keller): added Double support
32
33#pragma once
34#ifndef SIMD_VEC_BASE_IMPL_INTEL_16_H_
35#define SIMD_VEC_BASE_IMPL_INTEL_16_H_
36
37#include "../alloc.H"
38#include "../defs.H"
39#include "../types.H"
40#include "../vec.H"
41#include "SSSE3_compat.H"
42#include "intrins_intel.H"
43
44#include <cmath>
45#include <cstddef>
46#include <cstdint>
47#include <limits>
48#include <type_traits>
49
50#if defined(SIMDVEC_INTEL_ENABLE) && defined(_SIMD_VEC_16_AVAIL_) && \
51 !defined(SIMDVEC_SANDBOX)
52
53namespace simd {
54
55// ===========================================================================
56// NOTES:
57//
58// - setting zero inside the function is not inefficient, see:
59// http://stackoverflow.com/questions/26807285/...
60// ...are-static-static-local-sse-avx-variables-blocking-a-xmm-ymm-register
61//
62// - for floating-point types (Float, Double) there are no saturated versions
63// of add/sub instructions; in this case we use the unsaturated version;
64// the user is responsible for avoiding overflows
65//
66// - we could improve performance by using 128-bit instructions from
67// AVX512-VL (e.g. permute instructions); at the moment the idea is that
68// typically the widest vector width is used, so if AVX512 is available,
69// SSE would only rarely be used
70//
71// ===========================================================================
72
73// ===========================================================================
74// Vec integer instantiation for SSE
75// ===========================================================================
76
77// partial specialization for SIMD_WIDTH = 16
78template <typename T>
79class Vec<T, 16>
80{
81 __m128i xmm = _mm_setzero_si128();
82
83public:
84 using Type = T;
85 static constexpr size_t elements = 16 / sizeof(T);
86 static constexpr size_t elems = elements;
87 static constexpr size_t bytes = 16;
88
89 Vec() = default;
90 Vec(const __m128i &x) { xmm = x; }
91 Vec &operator=(const __m128i &x)
92 {
93 xmm = x;
94 return *this;
95 }
96 operator __m128i() const { return xmm; }
97 // 29. Nov 22 (Jonas Keller):
98 // defined operators new and delete to ensure proper alignment, since
99 // the default new and delete are not guaranteed to do so before C++17
100 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
101 void operator delete(void *p) { aligned_free(p); }
102 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
103 void operator delete[](void *p) { aligned_free(p); }
104 // 05. Sep 23 (Jonas Keller): added allocator
105 using allocator = aligned_allocator<Vec<T, bytes>, bytes>;
106};
107
108// ===========================================================================
109// Vec float specialization for SSE
110// ===========================================================================
111
112template <>
113class Vec<Float, 16>
114{
115 __m128 xmm = _mm_setzero_ps();
116
117public:
118 using Type = Float;
119 static constexpr size_t elements = 16 / sizeof(Float);
120 static constexpr size_t elems = elements;
121 static constexpr size_t bytes = 16;
122
123 Vec() = default;
124 Vec(const __m128 &x) { xmm = x; }
125 Vec &operator=(const __m128 &x)
126 {
127 xmm = x;
128 return *this;
129 }
130 operator __m128() const { return xmm; }
131 // 29. Nov 22 (Jonas Keller):
132 // defined operators new and delete to ensure proper alignment, since
133 // the default new and delete are not guaranteed to do so before C++17
134 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
135 void operator delete(void *p) { aligned_free(p); }
136 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
137 void operator delete[](void *p) { aligned_free(p); }
138 // 05. Sep 23 (Jonas Keller): added allocator
139 using allocator = aligned_allocator<Vec<Float, bytes>, bytes>;
140};
141
142// ===========================================================================
143// Vec double specialization for SSE
144// ===========================================================================
145
146template <>
147class Vec<Double, 16>
148{
149 __m128d xmm = _mm_setzero_pd();
150
151public:
152 using Type = Double;
153 static constexpr size_t elements = 16 / sizeof(Double);
154 static constexpr size_t elems = elements;
155 static constexpr size_t bytes = 16;
156 Vec() = default;
157 Vec(const __m128d &x) { xmm = x; }
158 Vec &operator=(const __m128d &x)
159 {
160 xmm = x;
161 return *this;
162 }
163 operator __m128d() const { return xmm; }
164 void *operator new(size_t size) { return aligned_malloc(bytes, size); }
165 void operator delete(void *p) { aligned_free(p); }
166 void *operator new[](size_t size) { return aligned_malloc(bytes, size); }
167 void operator delete[](void *p) { aligned_free(p); }
168 using allocator = aligned_allocator<Vec<Double, bytes>, bytes>;
169};
170
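// Illustrative usage sketch (not part of the library; the helper name is
// hypothetical): because the Vec specializations above define class-level
// operator new/new[], dynamically allocated vectors are 16-byte aligned
// even before C++17.
static SIMD_INLINE void example_aligned_vec_allocation()
{
  Vec<Float, 16> *buf = new Vec<Float, 16>[4]; // uses the aligned new[] above
  // ... use buf ...
  delete[] buf; // matching aligned delete[]
}
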
171namespace internal {
172namespace base {
173// ===========================================================================
174// Vec function template specialization or overloading for SSE
175// ===========================================================================
176
177// ---------------------------------------------------------------------------
178// reinterpretation casts
179// ---------------------------------------------------------------------------
180
181// 08. Apr 23 (Jonas Keller): used enable_if for cleaner implementation
182
183// between all integer types
184template <typename Tdst, typename Tsrc,
185 SIMD_ENABLE_IF((!std::is_same<Tdst, Tsrc>::value &&
186 std::is_integral<Tdst>::value &&
187 std::is_integral<Tsrc>::value))>
188static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Tsrc, 16> &vec,
189 OutputType<Tdst>)
190{
191 // 26. Nov 22 (Jonas Keller): reinterpret_cast is technically undefined
192 // behavior, so just rewrapping the vector register in a new Vec instead
193 // return reinterpret_cast<const Vec<Tdst,16>&>(vec);
194 return Vec<Tdst, 16>(__m128i(vec));
195}
196
197// from float to any integer type
198template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
199static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Float, 16> &vec,
200 OutputType<Tdst>)
201{
202 return _mm_castps_si128(vec);
203}
204
205// from any integer type to float
206template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
207static SIMD_INLINE Vec<Float, 16> reinterpret(const Vec<Tsrc, 16> &vec,
208 OutputType<Float>)
209{
210 return _mm_castsi128_ps(vec);
211}
212
213// from double to any integer type
214template <typename Tdst, SIMD_ENABLE_IF((std::is_integral<Tdst>::value))>
215static SIMD_INLINE Vec<Tdst, 16> reinterpret(const Vec<Double, 16> &vec,
216 OutputType<Tdst>)
217{
218 return _mm_castpd_si128(vec);
219}
220
221// from any integer type to double
222template <typename Tsrc, SIMD_ENABLE_IF((std::is_integral<Tsrc>::value))>
223static SIMD_INLINE Vec<Double, 16> reinterpret(const Vec<Tsrc, 16> &vec,
224 OutputType<Double>)
225{
226 return _mm_castsi128_pd(vec);
227}
228
229// from float to double
230static SIMD_INLINE Vec<Double, 16> reinterpret(const Vec<Float, 16> &vec,
231 OutputType<Double>)
232{
233 return _mm_castps_pd(vec);
234}
235
236// from double to float
237static SIMD_INLINE Vec<Float, 16> reinterpret(const Vec<Double, 16> &vec,
238 OutputType<Float>)
239{
240 return _mm_castpd_ps(vec);
241}
242
243// between identical types
244template <typename T>
245static SIMD_INLINE Vec<T, 16> reinterpret(const Vec<T, 16> &vec, OutputType<T>)
246{
247 return vec;
248}
249
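// Usage sketch (illustrative only; the helper name is hypothetical and the
// tag argument OutputType<Int>() is assumed to be default-constructible, as
// its use in the overloads above suggests): reinterpret only rewraps the
// underlying register, the bit pattern is left untouched.
static SIMD_INLINE Vec<Int, 16> example_reinterpret_float_as_int(
  const Vec<Float, 16> &v)
{
  return reinterpret(v, OutputType<Int>());
}
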
250// ---------------------------------------------------------------------------
251// convert (without changes in the number of elements)
252// ---------------------------------------------------------------------------
253
254// conversion with saturation; we want a fast solution that doesn't
255// trigger the overflow which would produce the negative two's
256// complement result ("invalid int32": 0x80000000); therefore we clamp
257// positive values at the largest positive float that is convertible
258// to int32 without overflow (0x7fffff80 = 2147483520);
259// negative values cannot overflow (they are clamped to the "invalid int",
260// which happens to be the most negative int32)
261static SIMD_INLINE Vec<Int, 16> cvts(const Vec<Float, 16> &a, OutputType<Int>)
262{
263 // TODO: analyze much more complex solution for cvts at
264 // TODO: http://stackoverflow.com/questions/9157373/
265 // TODO: most-efficient-way-to-convert-vector-of-float-to-vector-of-uint32
266 // NOTE: float->int: rounding is affected by MXCSR rounding control bits!
267 __m128 clip = _mm_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT32);
268 return _mm_cvtps_epi32(_mm_min_ps(clip, a));
269}
270
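// Scalar sketch of the same saturating behavior (illustrative only; the
// function name is hypothetical and not part of the library); like the
// vector version it assumes the default rounding mode (round to nearest).
static SIMD_INLINE Int example_cvts_float_to_int_scalar(Float x)
{
  // clamp at the largest float exactly convertible to int32 (2147483520.0f);
  // too-large negative values already convert to the most negative int32
  const Float clipped = (x > 2147483520.0f) ? 2147483520.0f : x;
  return Int(std::rint(clipped));
}
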
271// saturation is not necessary in this case
272static SIMD_INLINE Vec<Float, 16> cvts(const Vec<Int, 16> &a, OutputType<Float>)
273{
274 return _mm_cvtepi32_ps(a);
275}
276
277static SIMD_INLINE Vec<Long, 16> cvts(const Vec<Double, 16> &a,
278 OutputType<Long>)
279{
280 // _mm_cvtpd_epi64 is only available with AVX512
281 // workaround from https://stackoverflow.com/a/41148578 only works for
282 // values in range [-2^52, 2^52]
283 // using serial workaround instead
284 // TODO: serial workaround is slow, find parallel workaround
285 const auto clip = _mm_set1_pd(MAX_POS_DOUBLE_CONVERTIBLE_TO_INT64);
286 Double tmpD[2] SIMD_ATTR_ALIGNED(16);
287 _mm_store_pd(tmpD, _mm_min_pd(clip, a));
288 Long tmpL[2] SIMD_ATTR_ALIGNED(16);
289 tmpL[0] = Long(std::rint(tmpD[0]));
290 tmpL[1] = Long(std::rint(tmpD[1]));
291 return _mm_load_si128((__m128i *) tmpL);
292}
293
294static SIMD_INLINE Vec<Double, 16> cvts(const Vec<Long, 16> &a,
295 OutputType<Double>)
296{
297 // workaround from https://stackoverflow.com/a/41148578 (modified)
298 __m128i xH = _mm_srai_epi32(a, 16);
299 xH = _mm_and_si128(xH, _mm_set1_epi64x(0xffffffff00000000));
300 xH = _mm_add_epi64(
301 xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
302#ifdef __SSE4_1__
303 __m128i xL = _mm_blend_epi16(
304 a, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
305#else
306 __m128i xL =
307 _mm_or_si128(_mm_and_si128(a, _mm_set1_epi64x(0x0000ffffffffffff)),
308 _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))); // 2^52
309#endif
310 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH),
311 _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
312 return _mm_add_pd(f, _mm_castsi128_pd(xL));
313}
314
315// ---------------------------------------------------------------------------
316// setzero
317// ---------------------------------------------------------------------------
318
319template <typename T, SIMD_ENABLE_IF(std::is_integral<T>::value)>
320static SIMD_INLINE Vec<T, 16> setzero(OutputType<T>, Integer<16>)
321{
322 return _mm_setzero_si128();
323}
324
325static SIMD_INLINE Vec<Float, 16> setzero(OutputType<Float>, Integer<16>)
326{
327 return _mm_setzero_ps();
328}
329
330static SIMD_INLINE Vec<Double, 16> setzero(OutputType<Double>, Integer<16>)
331{
332 return _mm_setzero_pd();
333}
334
335// ---------------------------------------------------------------------------
336// set1
337// ---------------------------------------------------------------------------
338
339static SIMD_INLINE Vec<Byte, 16> set1(Byte a, Integer<16>)
340{
341 return _mm_set1_epi8(a);
342}
343
344static SIMD_INLINE Vec<SignedByte, 16> set1(SignedByte a, Integer<16>)
345{
346 return _mm_set1_epi8(a);
347}
348
349static SIMD_INLINE Vec<Word, 16> set1(Word a, Integer<16>)
350{
351 return _mm_set1_epi16(a);
352}
353
354static SIMD_INLINE Vec<Short, 16> set1(Short a, Integer<16>)
355{
356 return _mm_set1_epi16(a);
357}
358
359static SIMD_INLINE Vec<Int, 16> set1(Int a, Integer<16>)
360{
361 return _mm_set1_epi32(a);
362}
363
364static SIMD_INLINE Vec<Long, 16> set1(Long a, Integer<16>)
365{
366 return _mm_set1_epi64x(a);
367}
368
369static SIMD_INLINE Vec<Float, 16> set1(Float a, Integer<16>)
370{
371 return _mm_set1_ps(a);
372}
373
374static SIMD_INLINE Vec<Double, 16> set1(Double a, Integer<16>)
375{
376 return _mm_set1_pd(a);
377}
378
379// ---------------------------------------------------------------------------
380// load
381// ---------------------------------------------------------------------------
382
383template <typename T>
384static SIMD_INLINE Vec<T, 16> load(const T *const p, Integer<16>)
385{
386 // SSE load and store instructions need 16-byte alignment
387 // (the lower 4 address bits must be zero)
388 SIMD_CHECK_ALIGNMENT(p, 16);
389 return _mm_load_si128((__m128i *) p);
390}
391
392static SIMD_INLINE Vec<Float, 16> load(const Float *const p, Integer<16>)
393{
394 // SSE load and store instructions need 16-byte alignment
395 // (the lower 4 address bits must be zero)
396 SIMD_CHECK_ALIGNMENT(p, 16);
397 return _mm_load_ps(p);
398}
399
400static SIMD_INLINE Vec<Double, 16> load(const Double *const p, Integer<16>)
401{
402 // SSE load and store instructions need 16-byte alignment
403 // (the lower 4 address bits must be zero)
404 SIMD_CHECK_ALIGNMENT(p, 16);
405 return _mm_load_pd(p);
406}
407
408// ---------------------------------------------------------------------------
409// loadu
410// ---------------------------------------------------------------------------
411
412template <typename T>
413static SIMD_INLINE Vec<T, 16> loadu(const T *const p, Integer<16>)
414{
415 return _mm_loadu_si128((__m128i *) p);
416}
417
418static SIMD_INLINE Vec<Float, 16> loadu(const Float *const p, Integer<16>)
419{
420 return _mm_loadu_ps(p);
421}
422
423static SIMD_INLINE Vec<Double, 16> loadu(const Double *const p, Integer<16>)
424{
425 return _mm_loadu_pd(p);
426}
427
428// ---------------------------------------------------------------------------
429// store
430// ---------------------------------------------------------------------------
431
432// all integer versions
433template <typename T>
434static SIMD_INLINE void store(T *const p, const Vec<T, 16> &a)
435{
436 // SSE load and store instructions need 16-byte alignment
437 // (the lower 4 address bits must be zero)
438 SIMD_CHECK_ALIGNMENT(p, 16);
439 _mm_store_si128((__m128i *) p, a);
440}
441
442// float version
443static SIMD_INLINE void store(Float *const p, const Vec<Float, 16> &a)
444{
445 // SSE load and store instructions need 16-byte alignment
446 // (the lower 4 address bits must be zero)
447 SIMD_CHECK_ALIGNMENT(p, 16);
448 _mm_store_ps(p, a);
449}
450
451// double version
452static SIMD_INLINE void store(Double *const p, const Vec<Double, 16> &a)
453{
454 // SSE load and store instructions need 16-byte alignment
455 // (the lower 4 address bits must be zero)
456 SIMD_CHECK_ALIGNMENT(p, 16);
457 _mm_store_pd(p, a);
458}
459
460// ---------------------------------------------------------------------------
461// storeu
462// ---------------------------------------------------------------------------
463
464// all integer versions
465template <typename T>
466static SIMD_INLINE void storeu(T *const p, const Vec<T, 16> &a)
467{
468 _mm_storeu_si128((__m128i *) p, a);
469}
470
471// float version
472static SIMD_INLINE void storeu(Float *const p, const Vec<Float, 16> &a)
473{
474 _mm_storeu_ps(p, a);
475}
476
477// double version
478static SIMD_INLINE void storeu(Double *const p, const Vec<Double, 16> &a)
479{
480 _mm_storeu_pd(p, a);
481}
482
483// ---------------------------------------------------------------------------
484// stream_store
485// ---------------------------------------------------------------------------
486
487// all integer versions
488template <typename T>
489static SIMD_INLINE void stream_store(T *const p, const Vec<T, 16> &a)
490{
491 // SSE load and store instructions need 16-byte alignment
492 // (the lower 4 address bits must be zero)
493 SIMD_CHECK_ALIGNMENT(p, 16);
494 _mm_stream_si128((__m128i *) p, a);
495}
496
497// float version
498static SIMD_INLINE void stream_store(Float *const p, const Vec<Float, 16> &a)
499{
500 // SSE load and store instructions need 16-byte alignment
501 // (the lower 4 address bits must be zero)
502 SIMD_CHECK_ALIGNMENT(p, 16);
503 _mm_stream_ps(p, a);
504}
505
506// double version
507static SIMD_INLINE void stream_store(Double *const p, const Vec<Double, 16> &a)
508{
509 // SSE load and store instructions need 16-byte alignment
510 // (the lower 4 address bits must be zero)
511 SIMD_CHECK_ALIGNMENT(p, 16);
512 _mm_stream_pd(p, a);
513}
514
515// ---------------------------------------------------------------------------
516// fences (defined only here and not in SIMDVec32.H)
517// ---------------------------------------------------------------------------
518
519static SIMD_INLINE void lfence()
520{
521 _mm_lfence();
522}
523
524static SIMD_INLINE void sfence()
525{
526 _mm_sfence();
527}
528
529static SIMD_INLINE void mfence()
530{
531 _mm_mfence();
532}
533
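// Usage sketch (illustrative; the function name is hypothetical and not part
// of the library): non-temporal stores bypass the cache, so an sfence is
// typically issued after a stream_store loop before the data is consumed
// elsewhere; here dst is assumed to be 16-byte aligned and n a multiple of
// Vec<Float, 16>::elements (= 4).
static SIMD_INLINE void example_stream_store_with_fence(Float *const dst,
                                                        const Vec<Float, 16> &v,
                                                        const size_t n)
{
  for (size_t i = 0; i + 4 <= n; i += 4) stream_store(dst + i, v);
  sfence(); // make the non-temporal stores visible to subsequent readers
}
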
534// ---------------------------------------------------------------------------
535// extract: with template parameter for immediate argument
536// ---------------------------------------------------------------------------
537
538template <size_t INDEX>
539static SIMD_INLINE Byte extract(const Vec<Byte, 16> &a)
540{
541 SIMD_IF_CONSTEXPR (INDEX == 0) {
542 return _mm_cvtsi128_si32(a);
543 } else SIMD_IF_CONSTEXPR (INDEX < 16) {
544#ifdef __SSE4_1__
545 return _mm_extract_epi8(a, INDEX);
546#else
547 SIMD_IF_CONSTEXPR ((INDEX & 0x1) == 0) {
548 return _mm_extract_epi16(a, INDEX / 2) & 0xff;
549 } else {
550 return _mm_extract_epi16(_mm_srli_epi16(a, 8), INDEX / 2);
551 }
552#endif
553 } else {
554 return 0;
555 }
556}
557
558template <size_t INDEX>
559static SIMD_INLINE SignedByte extract(const Vec<SignedByte, 16> &a)
560{
561 SIMD_IF_CONSTEXPR (INDEX == 0) {
562 return _mm_cvtsi128_si32(a);
563 } else SIMD_IF_CONSTEXPR (INDEX < 16) {
564#ifdef __SSE4_1__
565 return _mm_extract_epi8(a, INDEX);
566#else
567 SIMD_IF_CONSTEXPR ((INDEX & 0x1) == 0) {
568 return _mm_extract_epi16(a, INDEX / 2) & 0xff;
569 } else {
570 return _mm_extract_epi16(_mm_srli_epi16(a, 8), INDEX / 2);
571 }
572#endif
573 } else {
574 return 0;
575 }
576}
577
578template <size_t INDEX>
579static SIMD_INLINE Word extract(const Vec<Word, 16> &a)
580{
581 SIMD_IF_CONSTEXPR (INDEX == 0) {
582 return _mm_cvtsi128_si32(a);
583 } else SIMD_IF_CONSTEXPR (INDEX < 8) {
584 return _mm_extract_epi16(a, INDEX);
585 } else {
586 return 0;
587 }
588}
589
590template <size_t INDEX>
591static SIMD_INLINE Short extract(const Vec<Short, 16> &a)
592{
593 SIMD_IF_CONSTEXPR (INDEX == 0) {
594 return _mm_cvtsi128_si32(a);
595 } else SIMD_IF_CONSTEXPR (INDEX < 8) {
596 return _mm_extract_epi16(a, INDEX);
597 } else {
598 return 0;
599 }
600}
601
602template <size_t INDEX>
603static SIMD_INLINE Int extract(const Vec<Int, 16> &a)
604{
605 SIMD_IF_CONSTEXPR (INDEX == 0) {
606 return _mm_cvtsi128_si32(a);
607 } else SIMD_IF_CONSTEXPR (INDEX < 4) {
608#ifdef __SSE4_1__
609 return _mm_extract_epi32(a, INDEX);
610#else
611 return _mm_cvtsi128_si32(_mm_srli_si128(a, INDEX * 4));
612#endif
613 } else {
614 return 0;
615 }
616}
617
618template <size_t INDEX>
619static SIMD_INLINE Long extract(const Vec<Long, 16> &a)
620{
621 SIMD_IF_CONSTEXPR (INDEX == 0) {
622 return _mm_cvtsi128_si64(a);
623 } else SIMD_IF_CONSTEXPR (INDEX == 1) {
624 return _mm_cvtsi128_si64(_mm_srli_si128(a, 8));
625 } else {
626 return 0;
627 }
628}
629
630template <size_t INDEX>
631static SIMD_INLINE Float extract(const Vec<Float, 16> &a)
632{
633 SIMD_IF_CONSTEXPR (INDEX == 0) {
634 return ::simd::internal::bit_cast<Float>(
635 _mm_cvtsi128_si32(_mm_castps_si128(a)));
636 } else SIMD_IF_CONSTEXPR (INDEX < 4) {
637#ifdef __SSE4_2__
638 const int intRes = _mm_extract_ps(a, INDEX);
639#else
640 const int intRes =
641 _mm_cvtsi128_si32(_mm_srli_si128(_mm_castps_si128(a), INDEX * 4));
642#endif
643 return ::simd::internal::bit_cast<Float>(intRes);
644 } else {
645 return 0.0f;
646 }
647}
648
649template <size_t INDEX>
650static SIMD_INLINE Double extract(const Vec<Double, 16> &a)
651{
652 SIMD_IF_CONSTEXPR (INDEX == 0) {
653 return ::simd::internal::bit_cast<Double>(
654 _mm_cvtsi128_si64(_mm_castpd_si128(a)));
655 } else SIMD_IF_CONSTEXPR (INDEX == 1) {
656 return ::simd::internal::bit_cast<Double>(
657 _mm_cvtsi128_si64(_mm_srli_si128(_mm_castpd_si128(a), 8)));
658 } else {
659 return 0.0;
660 }
661}
662
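// Usage sketch (illustrative; the helper name is hypothetical): the lane
// index of extract is a template parameter, i.e. a compile-time constant,
// and out-of-range indices simply yield 0.
static SIMD_INLINE Float example_extract_usage(const Vec<Float, 16> &v)
{
  return extract<0>(v) + extract<3>(v); // first and last float lane
}
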
663// ---------------------------------------------------------------------------
664// ifelse
665// ---------------------------------------------------------------------------
666
667// elements of cond must be all 1's or all 0's (blendv just tests top
668// bit in each byte, but work-around needs this)
669
670template <typename T>
671static SIMD_INLINE Vec<T, 16> ifelse(const Vec<T, 16> &cond,
672 const Vec<T, 16> &trueVal,
673 const Vec<T, 16> &falseVal)
674{
675#ifdef __SSE4_1__
676 return _mm_blendv_epi8(falseVal, trueVal, cond);
677#else
678 return _mm_or_si128(_mm_and_si128(cond, trueVal),
679 _mm_andnot_si128(cond, falseVal));
680#endif
681}
682
683static SIMD_INLINE Vec<Float, 16> ifelse(const Vec<Float, 16> &cond,
684 const Vec<Float, 16> &trueVal,
685 const Vec<Float, 16> &falseVal)
686{
687#ifdef __SSE4_1__
688 return _mm_blendv_ps(falseVal, trueVal, cond);
689#else
690 return _mm_or_ps(_mm_and_ps(cond, trueVal), _mm_andnot_ps(cond, falseVal));
691#endif
692}
693
694static SIMD_INLINE Vec<Double, 16> ifelse(const Vec<Double, 16> &cond,
695 const Vec<Double, 16> &trueVal,
696 const Vec<Double, 16> &falseVal)
697{
698#ifdef __SSE4_1__
699 return _mm_blendv_pd(falseVal, trueVal, cond);
700#else
701 return _mm_or_pd(_mm_and_pd(cond, trueVal), _mm_andnot_pd(cond, falseVal));
702#endif
703}
704
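// Scalar sketch of the pre-SSE4.1 fallback above (illustrative only; the
// helper name is hypothetical): the classic bit-select identity
// (cond & trueVal) | (~cond & falseVal), which is why cond must be all 1s
// or all 0s in every element.
static SIMD_INLINE uint32_t example_bit_select_scalar(uint32_t cond,
                                                      uint32_t trueVal,
                                                      uint32_t falseVal)
{
  return (cond & trueVal) | (~cond & falseVal);
}
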
705// ---------------------------------------------------------------------------
706// add
707// ---------------------------------------------------------------------------
708
709static SIMD_INLINE Vec<Byte, 16> add(const Vec<Byte, 16> &a,
710 const Vec<Byte, 16> &b)
711{
712 return _mm_add_epi8(a, b);
713}
714
715static SIMD_INLINE Vec<SignedByte, 16> add(const Vec<SignedByte, 16> &a,
716 const Vec<SignedByte, 16> &b)
717{
718 return _mm_add_epi8(a, b);
719}
720
721static SIMD_INLINE Vec<Word, 16> add(const Vec<Word, 16> &a,
722 const Vec<Word, 16> &b)
723{
724 return _mm_add_epi16(a, b);
725}
726
727static SIMD_INLINE Vec<Short, 16> add(const Vec<Short, 16> &a,
728 const Vec<Short, 16> &b)
729{
730 return _mm_add_epi16(a, b);
731}
732
733static SIMD_INLINE Vec<Int, 16> add(const Vec<Int, 16> &a,
734 const Vec<Int, 16> &b)
735{
736 return _mm_add_epi32(a, b);
737}
738
739static SIMD_INLINE Vec<Long, 16> add(const Vec<Long, 16> &a,
740 const Vec<Long, 16> &b)
741{
742 return _mm_add_epi64(a, b);
743}
744
745static SIMD_INLINE Vec<Float, 16> add(const Vec<Float, 16> &a,
746 const Vec<Float, 16> &b)
747{
748 return _mm_add_ps(a, b);
749}
750
751static SIMD_INLINE Vec<Double, 16> add(const Vec<Double, 16> &a,
752 const Vec<Double, 16> &b)
753{
754 return _mm_add_pd(a, b);
755}
756
757// ---------------------------------------------------------------------------
758// adds
759// ---------------------------------------------------------------------------
760
761static SIMD_INLINE Vec<Byte, 16> adds(const Vec<Byte, 16> &a,
762 const Vec<Byte, 16> &b)
763{
764 return _mm_adds_epu8(a, b);
765}
766
767static SIMD_INLINE Vec<SignedByte, 16> adds(const Vec<SignedByte, 16> &a,
768 const Vec<SignedByte, 16> &b)
769{
770 return _mm_adds_epi8(a, b);
771}
772
773static SIMD_INLINE Vec<Word, 16> adds(const Vec<Word, 16> &a,
774 const Vec<Word, 16> &b)
775{
776 return _mm_adds_epu16(a, b);
777}
778
779static SIMD_INLINE Vec<Short, 16> adds(const Vec<Short, 16> &a,
780 const Vec<Short, 16> &b)
781{
782 return _mm_adds_epi16(a, b);
783}
784
785static SIMD_INLINE Vec<Int, 16> adds(const Vec<Int, 16> &a,
786 const Vec<Int, 16> &b)
787{
788 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
789 // saturated
790
791 // _mm_adds_epi32 does not exist, workaround:
792 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
793 // addition occurs if and only if the operands have the same sign and the
794 // sum has a sign opposite to that of the operands."
795 const __m128i sum = _mm_add_epi32(a, b);
796 const __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
797 const __m128i sumHasDiffSign = _mm_xor_si128(a, sum);
798 // indicates when an overflow has occurred
799 const __m128i overflow =
800 _mm_srai_epi32(_mm_andnot_si128(opsHaveDiffSign, sumHasDiffSign), 31);
801 // saturated sum in case an overflow occurred (0x7FFFFFFF = max positive int
802 // when the sign of a (and thus of b) is 0; 0x80000000 = min negative int
803 // when the sign of a (and thus of b) is 1)
804 const __m128i saturatedSum =
805 _mm_xor_si128(_mm_srai_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
806 // return saturated sum if overflow occurred, otherwise return sum
807 return ifelse(Vec<Int, 16>(overflow), Vec<Int, 16>(saturatedSum),
808 Vec<Int, 16>(sum));
809}
810
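// Scalar sketch of the same saturation logic (illustrative only; the helper
// name is hypothetical): signed overflow occurs iff both operands have the
// same sign and the sum's sign differs; the saturated value then depends
// only on the sign of a.
static SIMD_INLINE Int example_adds_int_scalar(Int x, Int y)
{
  const Int sum = Int(uint32_t(x) + uint32_t(y)); // wrap-around addition
  const bool overflow = ((x ^ y) >= 0) && ((x ^ sum) < 0);
  const Int saturated = (x < 0) ? std::numeric_limits<Int>::min()
                                : std::numeric_limits<Int>::max();
  return overflow ? saturated : sum;
}
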
811static SIMD_INLINE Vec<Long, 16> adds(const Vec<Long, 16> &a,
812 const Vec<Long, 16> &b)
813{
814 // _mm_adds_epi64 does not exist, workaround:
815 // Hacker's Delight, 2-13 Overflow Detection: "Signed integer overflow of
816 // addition occurs if and only if the operands have the same sign and the
817 // sum has a sign opposite to that of the operands."
818 __m128i sum = _mm_add_epi64(a, b);
819 __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
820 __m128i sumHasDiffSign = _mm_xor_si128(a, sum);
821 // indicates when an overflow has occurred
822 __m128i overflow32 =
823 _mm_srai_epi32(_mm_andnot_si128(opsHaveDiffSign, sumHasDiffSign), 31);
824 __m128i overflow = _mm_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
825 __m128i signMaskA32 = _mm_srai_epi32(a, 31);
826 __m128i signMaskA = _mm_shuffle_epi32(signMaskA32, _MM_SHUFFLE(3, 3, 1, 1));
827 // saturated sum in case an overflow occurred (0x7FFFFFFFFFFFFFFF = max
828 // positive long when the sign of a (and thus of b) is 0,
829 // 0x8000000000000000 = min negative long when the sign of a (and thus of b)
830 // is 1)
831 __m128i saturatedSum =
832 _mm_xor_si128(signMaskA, _mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
833 // return saturated sum if overflow occurred, otherwise return sum
834 return ifelse(Vec<Long, 16>(overflow), Vec<Long, 16>(saturatedSum),
835 Vec<Long, 16>(sum));
836}
837
838// Float not saturated
839static SIMD_INLINE Vec<Float, 16> adds(const Vec<Float, 16> &a,
840 const Vec<Float, 16> &b)
841{
842 return _mm_add_ps(a, b);
843}
844
845// Double not saturated
846static SIMD_INLINE Vec<Double, 16> adds(const Vec<Double, 16> &a,
847 const Vec<Double, 16> &b)
848{
849 return _mm_add_pd(a, b);
850}
851
852// ---------------------------------------------------------------------------
853// sub
854// ---------------------------------------------------------------------------
855
856static SIMD_INLINE Vec<Byte, 16> sub(const Vec<Byte, 16> &a,
857 const Vec<Byte, 16> &b)
858{
859 return _mm_sub_epi8(a, b);
860}
861
862static SIMD_INLINE Vec<SignedByte, 16> sub(const Vec<SignedByte, 16> &a,
863 const Vec<SignedByte, 16> &b)
864{
865 return _mm_sub_epi8(a, b);
866}
867
868static SIMD_INLINE Vec<Word, 16> sub(const Vec<Word, 16> &a,
869 const Vec<Word, 16> &b)
870{
871 return _mm_sub_epi16(a, b);
872}
873
874static SIMD_INLINE Vec<Short, 16> sub(const Vec<Short, 16> &a,
875 const Vec<Short, 16> &b)
876{
877 return _mm_sub_epi16(a, b);
878}
879
880static SIMD_INLINE Vec<Int, 16> sub(const Vec<Int, 16> &a,
881 const Vec<Int, 16> &b)
882{
883 return _mm_sub_epi32(a, b);
884}
885
886static SIMD_INLINE Vec<Long, 16> sub(const Vec<Long, 16> &a,
887 const Vec<Long, 16> &b)
888{
889 return _mm_sub_epi64(a, b);
890}
891
892static SIMD_INLINE Vec<Float, 16> sub(const Vec<Float, 16> &a,
893 const Vec<Float, 16> &b)
894{
895 return _mm_sub_ps(a, b);
896}
897
898static SIMD_INLINE Vec<Double, 16> sub(const Vec<Double, 16> &a,
899 const Vec<Double, 16> &b)
900{
901 return _mm_sub_pd(a, b);
902}
903
904// ---------------------------------------------------------------------------
905// subs
906// ---------------------------------------------------------------------------
907
908static SIMD_INLINE Vec<Byte, 16> subs(const Vec<Byte, 16> &a,
909 const Vec<Byte, 16> &b)
910{
911 return _mm_subs_epu8(a, b);
912}
913
914static SIMD_INLINE Vec<SignedByte, 16> subs(const Vec<SignedByte, 16> &a,
915 const Vec<SignedByte, 16> &b)
916{
917 return _mm_subs_epi8(a, b);
918}
919
920static SIMD_INLINE Vec<Word, 16> subs(const Vec<Word, 16> &a,
921 const Vec<Word, 16> &b)
922{
923 return _mm_subs_epu16(a, b);
924}
925
926static SIMD_INLINE Vec<Short, 16> subs(const Vec<Short, 16> &a,
927 const Vec<Short, 16> &b)
928{
929 return _mm_subs_epi16(a, b);
930}
931
932static SIMD_INLINE Vec<Int, 16> subs(const Vec<Int, 16> &a,
933 const Vec<Int, 16> &b)
934{
935 // 09. Mar 23 (Jonas Keller): added workaround so that this function is
936 // saturated
937
938 // _mm_subs_epi32 does not exist, workaround:
939 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
940 // value of x−y [...] occurs if and only if x and y have opposite signs and
941 // the sign of x−y [...] is opposite to that of x [...]"
942 const __m128i diff = _mm_sub_epi32(a, b);
943 const __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
944 const __m128i diffHasDiffSign = _mm_xor_si128(a, diff);
945 // indicates when an overflow has occurred
946 const __m128i overflow =
947 _mm_srai_epi32(_mm_and_si128(opsHaveDiffSign, diffHasDiffSign), 31);
948 // saturated diff in case an overflow occurred (0x7FFFFFFF = max positive int
949 // when the sign of a is 0 (and b's is 1); 0x80000000 = min negative int
950 // when the sign of a is 1 (and b's is 0))
951 const __m128i saturatedDiff =
952 _mm_xor_si128(_mm_srai_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
953 // return saturated diff if overflow occurred, otherwise return diff
954 return ifelse(Vec<Int, 16>(overflow), Vec<Int, 16>(saturatedDiff),
955 Vec<Int, 16>(diff));
956}
957
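// Scalar sketch of the subtraction case (illustrative only; the helper name
// is hypothetical): overflow now requires opposite operand signs, which is
// why the vector code uses _mm_and_si128 where adds used _mm_andnot_si128.
static SIMD_INLINE Int example_subs_int_scalar(Int x, Int y)
{
  const Int diff = Int(uint32_t(x) - uint32_t(y)); // wrap-around subtraction
  const bool overflow = ((x ^ y) < 0) && ((x ^ diff) < 0);
  const Int saturated = (x < 0) ? std::numeric_limits<Int>::min()
                                : std::numeric_limits<Int>::max();
  return overflow ? saturated : diff;
}
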
958static SIMD_INLINE Vec<Long, 16> subs(const Vec<Long, 16> &a,
959 const Vec<Long, 16> &b)
960{
961 // _mm_subs_epi64 does not exist, workaround:
962 // Hacker's Delight, 2-13 Overflow Detection: "[...] overflow in the final
963 // value of x−y [...] occurs if and only if x and y have opposite signs and
964 // the sign of x−y [...] is opposite to that of x [...]"
965 __m128i diff = _mm_sub_epi64(a, b);
966 __m128i opsHaveDiffSign = _mm_xor_si128(a, b);
967 __m128i diffHasDiffSign = _mm_xor_si128(a, diff);
968 // indicates when an overflow has occurred
969 __m128i overflow32 =
970 _mm_srai_epi32(_mm_and_si128(opsHaveDiffSign, diffHasDiffSign), 63);
971 __m128i overflow = _mm_shuffle_epi32(overflow32, _MM_SHUFFLE(3, 3, 1, 1));
972 __m128i signMaskA32 = _mm_srai_epi32(a, 63);
973 __m128i signMaskA = _mm_shuffle_epi32(signMaskA32, _MM_SHUFFLE(3, 3, 1, 1));
974 // saturated diff in case an overflow occurred (0x7FFFFFFFFFFFFFFF = max
975 // positive long when the sign of a is 0 (and b's is 1),
976 // 0x8000000000000000 = min negative long when the sign of a is 1
977 // (and b's is 0))
978 __m128i saturatedDiff =
979 _mm_xor_si128(signMaskA, _mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
980 // return saturated diff if overflow occurred, otherwise return diff
981 return ifelse(Vec<Long, 16>(overflow), Vec<Long, 16>(saturatedDiff),
982 Vec<Long, 16>(diff));
983}
984
985// Float not saturated
986static SIMD_INLINE Vec<Float, 16> subs(const Vec<Float, 16> &a,
987 const Vec<Float, 16> &b)
988{
989 return _mm_sub_ps(a, b);
990}
991
992// Double not saturated
993static SIMD_INLINE Vec<Double, 16> subs(const Vec<Double, 16> &a,
994 const Vec<Double, 16> &b)
995{
996 return _mm_sub_pd(a, b);
997}
998
999// ---------------------------------------------------------------------------
1000// neg (negate = two's complement or unary minus), only signed types
1001// ---------------------------------------------------------------------------
1002
1003static SIMD_INLINE Vec<SignedByte, 16> neg(const Vec<SignedByte, 16> &a)
1004{
1005 return _mm_sub_epi8(_mm_setzero_si128(), a);
1006}
1007
1008static SIMD_INLINE Vec<Short, 16> neg(const Vec<Short, 16> &a)
1009{
1010 return _mm_sub_epi16(_mm_setzero_si128(), a);
1011}
1012
1013static SIMD_INLINE Vec<Int, 16> neg(const Vec<Int, 16> &a)
1014{
1015 return _mm_sub_epi32(_mm_setzero_si128(), a);
1016}
1017
1018static SIMD_INLINE Vec<Long, 16> neg(const Vec<Long, 16> &a)
1019{
1020 return _mm_sub_epi64(_mm_setzero_si128(), a);
1021}
1022
1023static SIMD_INLINE Vec<Float, 16> neg(const Vec<Float, 16> &a)
1024{
1025 return _mm_sub_ps(_mm_setzero_ps(), a);
1026}
1027
1028static SIMD_INLINE Vec<Double, 16> neg(const Vec<Double, 16> &a)
1029{
1030 return _mm_xor_pd(a, _mm_set1_pd(-0.0));
1031}
1032
1033// ---------------------------------------------------------------------------
1034// min
1035// ---------------------------------------------------------------------------
1036
1037static SIMD_INLINE Vec<Byte, 16> min(const Vec<Byte, 16> &a,
1038 const Vec<Byte, 16> &b)
1039{
1040 return _mm_min_epu8(a, b);
1041}
1042
1043static SIMD_INLINE Vec<SignedByte, 16> min(const Vec<SignedByte, 16> &a,
1044 const Vec<SignedByte, 16> &b)
1045{
1046#ifdef __SSE4_1__
1047 return _mm_min_epi8(a, b);
1048#else
1049 // from Agner Fog's VCL vectori128.h
1050 const __m128i signbit = _mm_set1_epi32(0x80808080);
1051 const __m128i a1 = _mm_xor_si128(a, signbit); // add 0x80
1052 const __m128i b1 = _mm_xor_si128(b, signbit); // add 0x80
1053 const __m128i m1 = _mm_min_epu8(a1, b1); // unsigned min
1054 return _mm_xor_si128(m1, signbit); // sub 0x80
1055#endif
1056}
1057
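// Scalar sketch of the bias trick above (illustrative only; the helper name
// is hypothetical): XORing the sign bit maps signed order onto unsigned
// order, so the unsigned byte min can emulate the missing signed byte min
// (the Word version below uses the same idea in the other direction).
static SIMD_INLINE SignedByte example_min_signed_byte_scalar(SignedByte a,
                                                             SignedByte b)
{
  const uint8_t a1 = uint8_t(a) ^ 0x80; // add bias
  const uint8_t b1 = uint8_t(b) ^ 0x80;
  const uint8_t m1 = (a1 < b1) ? a1 : b1; // unsigned min
  return SignedByte(m1 ^ 0x80);           // remove bias
}
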
1058static SIMD_INLINE Vec<Word, 16> min(const Vec<Word, 16> &a,
1059 const Vec<Word, 16> &b)
1060{
1061#ifdef __SSE4_1__
1062 return _mm_min_epu16(a, b);
1063#else
1064 // from Agner Fog's VCL vectori128.h
1065 const __m128i signbit = _mm_set1_epi32(0x80008000);
1066 const __m128i a1 = _mm_xor_si128(a, signbit); // add 0x8000
1067 const __m128i b1 = _mm_xor_si128(b, signbit); // add 0x8000
1068 const __m128i m1 = _mm_min_epi16(a1, b1); // signed min
1069 return _mm_xor_si128(m1, signbit); // sub 0x8000
1070#endif
1071}
1072
1073static SIMD_INLINE Vec<Short, 16> min(const Vec<Short, 16> &a,
1074 const Vec<Short, 16> &b)
1075{
1076 return _mm_min_epi16(a, b);
1077}
1078
1079static SIMD_INLINE Vec<Int, 16> min(const Vec<Int, 16> &a,
1080 const Vec<Int, 16> &b)
1081{
1082#ifdef __SSE4_1__
1083 return _mm_min_epi32(a, b);
1084#else
1085 // from Agner Fog's VCL vectori128.h (modified)
1086 const __m128i gt = _mm_cmpgt_epi32(a, b);
1087 return _mm_or_si128(_mm_and_si128(gt, b), _mm_andnot_si128(gt, a));
1088#endif
1089}
1090
1091// there is an unsigned version of min for 32-bit elements, but we
1092// currently don't have an element type for it
1093
1094static SIMD_INLINE Vec<Long, 16> min(const Vec<Long, 16> &a,
1095 const Vec<Long, 16> &b)
1096{
1097 // _mm_min_epi64 does not exist (not even in SSE4.1)
1098
1099 // compute a > b into gt
1100#ifdef __SSE4_2__
1101 const __m128i gt = _mm_cmpgt_epi64(a, b);
1102#else
1103 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
1104 const __m128i diff = _mm_sub_epi64(b, a);
1105#if 1 // TODO: check which is faster
1106 const __m128i res = _mm_xor_si128(
1107 diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
1108#else
1109 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
1110 _mm_andnot_si128(_mm_xor_si128(b, a), diff));
1111#endif
1112 // result in highest bit of res
1113 // spread highest bit to all bits
1114 const __m128i spread32 = _mm_srai_epi32(res, 31);
1115 const __m128i gt = _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1116#endif
1117
1118 // blend a and b according to gt
1119#ifdef __SSE4_1__
1120 return _mm_blendv_epi8(a, b, gt);
1121#else
1122 return _mm_or_si128(_mm_and_si128(gt, b), _mm_andnot_si128(gt, a));
1123#endif
1124}
1125
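// Scalar sketch of the Hacker's Delight 64-bit "greater than" used above
// when _mm_cmpgt_epi64 is unavailable (illustrative only; the helper name
// is hypothetical): the sign of b - a is corrected for overflow using the
// operand signs, leaving the comparison result in the top bit.
static SIMD_INLINE bool example_gt64_scalar(Long a, Long b)
{
  const Long diff = Long(uint64_t(b) - uint64_t(a)); // b - a with wrap-around
  const Long res  = diff ^ ((b ^ a) & (diff ^ b));
  return res < 0; // top bit set  <=>  a > b
}
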
1126static SIMD_INLINE Vec<Float, 16> min(const Vec<Float, 16> &a,
1127 const Vec<Float, 16> &b)
1128{
1129 return _mm_min_ps(a, b);
1130}
1131
1132static SIMD_INLINE Vec<Double, 16> min(const Vec<Double, 16> &a,
1133 const Vec<Double, 16> &b)
1134{
1135 return _mm_min_pd(a, b);
1136}
1137
1138// ---------------------------------------------------------------------------
1139// max
1140// ---------------------------------------------------------------------------
1141
1142static SIMD_INLINE Vec<Byte, 16> max(const Vec<Byte, 16> &a,
1143 const Vec<Byte, 16> &b)
1144{
1145 return _mm_max_epu8(a, b);
1146}
1147
1148static SIMD_INLINE Vec<SignedByte, 16> max(const Vec<SignedByte, 16> &a,
1149 const Vec<SignedByte, 16> &b)
1150{
1151#ifdef __SSE4_1__
1152 return _mm_max_epi8(a, b);
1153#else
1154 // from Agner Fog's VCL vectori128.h
1155 const __m128i signbit = _mm_set1_epi32(0x80808080);
1156 const __m128i a1 = _mm_xor_si128(a, signbit); // add 0x80
1157 const __m128i b1 = _mm_xor_si128(b, signbit); // add 0x80
1158 const __m128i m1 = _mm_max_epu8(a1, b1); // unsigned max
1159 return _mm_xor_si128(m1, signbit); // sub 0x80
1160#endif
1161}
1162
1163static SIMD_INLINE Vec<Word, 16> max(const Vec<Word, 16> &a,
1164 const Vec<Word, 16> &b)
1165{
1166#ifdef __SSE4_1__
1167 return _mm_max_epu16(a, b);
1168#else
1169 // from Agner Fog's VCL vectori128.h
1170 const __m128i signbit = _mm_set1_epi32(0x80008000);
1171 const __m128i a1 = _mm_xor_si128(a, signbit); // add 0x8000
1172 const __m128i b1 = _mm_xor_si128(b, signbit); // add 0x8000
1173 const __m128i m1 = _mm_max_epi16(a1, b1); // signed max
1174 return _mm_xor_si128(m1, signbit); // sub 0x8000
1175#endif
1176}
1177
1178static SIMD_INLINE Vec<Short, 16> max(const Vec<Short, 16> &a,
1179 const Vec<Short, 16> &b)
1180{
1181 return _mm_max_epi16(a, b);
1182}
1183
1184static SIMD_INLINE Vec<Int, 16> max(const Vec<Int, 16> &a,
1185 const Vec<Int, 16> &b)
1186{
1187#ifdef __SSE4_1__
1188 return _mm_max_epi32(a, b);
1189#else
1190 // from Agner Fog's VCL vectori128.h
1191 const __m128i gt = _mm_cmpgt_epi32(a, b);
1192 return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
1193#endif
1194}
1195
1196// there is an unsigned version of max for 32-bit elements, but we
1197// currently don't have an element type for it
1198
1199static SIMD_INLINE Vec<Long, 16> max(const Vec<Long, 16> &a,
1200 const Vec<Long, 16> &b)
1201{
1202 // _mm_max_epi64 does not exist (not even in SSE4.1)
1203
1204 // compute a > b into gt
1205#ifdef __SSE4_2__
1206 const __m128i gt = _mm_cmpgt_epi64(a, b);
1207#else
1208 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
1209 const __m128i diff = _mm_sub_epi64(b, a);
1210#if 1 // TODO: check which is faster
1211 const __m128i res = _mm_xor_si128(
1212 diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
1213#else
1214 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
1215 _mm_andnot_si128(_mm_xor_si128(b, a), diff));
1216#endif
1217 // result in highest bit of res
1218 // spread highest bit to all bits
1219 const __m128i spread32 = _mm_srai_epi32(res, 31);
1220 const __m128i gt = _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
1221#endif
1222
1223 // blend a and b according to gt
1224#ifdef __SSE4_1__
1225 return _mm_blendv_epi8(b, a, gt);
1226#else
1227 return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
1228#endif
1229}
1230
1231static SIMD_INLINE Vec<Float, 16> max(const Vec<Float, 16> &a,
1232 const Vec<Float, 16> &b)
1233{
1234 return _mm_max_ps(a, b);
1235}
1236
1237static SIMD_INLINE Vec<Double, 16> max(const Vec<Double, 16> &a,
1238 const Vec<Double, 16> &b)
1239{
1240 return _mm_max_pd(a, b);
1241}
1242
1243// ---------------------------------------------------------------------------
1244// mul, div
1245// ---------------------------------------------------------------------------
1246
1247// TODO: add mul/div versions for int types? or make special versions of mul
1248// TODO: and div where the result is scaled?
1249
1250static SIMD_INLINE Vec<Float, 16> mul(const Vec<Float, 16> &a,
1251 const Vec<Float, 16> &b)
1252{
1253 return _mm_mul_ps(a, b);
1254}
1255
1256static SIMD_INLINE Vec<Double, 16> mul(const Vec<Double, 16> &a,
1257 const Vec<Double, 16> &b)
1258{
1259 return _mm_mul_pd(a, b);
1260}
1261
1262static SIMD_INLINE Vec<Float, 16> div(const Vec<Float, 16> &a,
1263 const Vec<Float, 16> &b)
1264{
1265 return _mm_div_ps(a, b);
1266}
1267
1268static SIMD_INLINE Vec<Double, 16> div(const Vec<Double, 16> &a,
1269 const Vec<Double, 16> &b)
1270{
1271 return _mm_div_pd(a, b);
1272}
1273
1274// ---------------------------------------------------------------------------
1275// ceil, floor, round, truncate
1276// ---------------------------------------------------------------------------
1277
1278// 25. Mar 23 (Jonas Keller): added versions for integer types
1279
1280// TODO: import complex workaround for non-SSE4.1 from Agner Fog's VCL?
1281
1282// NOTE: behavior for workarounds differs for results of -0.0f and +0.0f
1283
1284// work-arounds for round, truncate, floor, and ceil all check whether
1285// rounding is necessary (or whether the float is an integer anyhow); this also
1286// prevents range excess when converting numbers to integer
1287
1288// workarounds for floor and ceil:
1289// https://en.wikipedia.org/wiki/Floor_and_ceiling_functions
1290//
1291// truncate in terms of floor and ceil:
1292//                { floor(x),  x >= 0
1293// truncate(x) = {
1294//                { ceil(x),   x < 0
1295//
1296// floor(x) = ceil(x) - (x in Z ? 0 : 1)
1297// ceil(x) = floor(x) + (x in Z ? 0 : 1)
1298
1299// versions for integer types do nothing:
1300
1301template <typename T>
1302static SIMD_INLINE Vec<T, 16> ceil(const Vec<T, 16> &a)
1303{
1304 static_assert(std::is_integral<T>::value, "");
1305 return a;
1306}
1307
1308template <typename T>
1309static SIMD_INLINE Vec<T, 16> floor(const Vec<T, 16> &a)
1310{
1311 static_assert(std::is_integral<T>::value, "");
1312 return a;
1313}
1314
1315template <typename T>
1316static SIMD_INLINE Vec<T, 16> round(const Vec<T, 16> &a)
1317{
1318 static_assert(std::is_integral<T>::value, "");
1319 return a;
1320}
1321
1322template <typename T>
1323static SIMD_INLINE Vec<T, 16> truncate(const Vec<T, 16> &a)
1324{
1325 static_assert(std::is_integral<T>::value, "");
1326 return a;
1327}
1328
1329static SIMD_INLINE Vec<Float, 16> ceil(const Vec<Float, 16> &a)
1330{
1331#ifdef __SSE4_1__
1332 return _mm_ceil_ps(a);
1333#else
1334 // if e>=23, floating point number represents an integer, 2^23 = 8388608
1335 const __m128 limit = _mm_set1_ps(8388608.f);
1336 // bool mask: no rounding required if abs(a) >= limit
1337 const __m128 absA =
1338 _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
1339 const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
1340 // bool mask: true if a is negative
1341 const __m128 isNeg =
1342 _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1343 // truncated result (for |a| < limit)
1344 __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
1345 // check if a is an integer
1346 const __m128 isNotInt = _mm_cmpneq_ps(a, aTrunc);
1347 // constant 1.0
1348 const __m128 one = _mm_set1_ps(1.0f);
1349 // mask which is 1.0f for non-negative non-integer values, 0.0f otherwise
1350 const __m128 oneMask = _mm_and_ps(_mm_andnot_ps(isNeg, isNotInt), one);
1351 // if non-negative, trunc computes floor, to turn it into ceil we
1352 // add 1 if aTrunc is non-integer
1353 aTrunc = _mm_add_ps(aTrunc, oneMask);
1354 // select result (a or aTrunc)
1355 return ifelse(noRndReq, a, aTrunc);
1356#endif
1357}
1358
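// Scalar sketch of the pre-SSE4.1 strategy above (illustrative only; the
// helper name is hypothetical): floats with |a| >= 2^23 are already integral
// and pass through unchanged; otherwise truncate and add 1 for positive
// non-integers.
static SIMD_INLINE Float example_ceil_scalar(Float a)
{
  if (std::fabs(a) >= 8388608.0f) return a; // 2^23: already an integer
  const Float t = Float(Int(a));            // truncate toward zero
  return (t < a) ? t + 1.0f : t;            // bump positive non-integers up
}
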
1359static SIMD_INLINE Vec<Double, 16> ceil(const Vec<Double, 16> &a)
1360{
1361#ifdef __SSE4_1__
1362 return _mm_ceil_pd(a);
1363#else
1364 // There is no _mm_cvtepi64_pd in SSE*, which makes the workaround
1365 // used for the float version not possible here.
1366 // Another workaround would probably be complicated and slow, so we just
1367 // ceil serially.
1368 // TODO: is there a better, vectorized workaround?
1369 Double inArr[2] SIMD_ATTR_ALIGNED(16);
1370 _mm_store_pd(inArr, a);
1371 Double outArr[2] SIMD_ATTR_ALIGNED(16);
1372 outArr[0] = std::ceil(inArr[0]);
1373 outArr[1] = std::ceil(inArr[1]);
1374 return _mm_load_pd(outArr);
1375#endif
1376}
1377
1378static SIMD_INLINE Vec<Float, 16> floor(const Vec<Float, 16> &a)
1379{
1380#ifdef __SSE4_1__
1381 return _mm_floor_ps(a);
1382#else
1383 // if e>=23, floating point number represents an integer, 2^23 = 8388608
1384 const __m128 limit = _mm_set1_ps(8388608.f);
1385 // bool mask: no rounding required if abs(a) >= limit
1386 const __m128 absA =
1387 _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
1388 const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
1389 // bool mask: true if a is negative
1390 const __m128 isNeg =
1391 _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1392 // truncated result (for |a| < limit)
1393 __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
1394 // check if a is an integer
1395 const __m128 isNotInt = _mm_cmpneq_ps(a, aTrunc);
1396 // constant 1.0
1397 const __m128 one = _mm_set1_ps(1.0f);
1398 // mask which is 1.0f for negative non-integer values, 0.0f otherwise
1399 const __m128 oneMask = _mm_and_ps(_mm_and_ps(isNeg, isNotInt), one);
1400 // if negative, trunc computes ceil, to turn it into floor we sub
1401 // 1 if aTrunc is non-integer
1402 aTrunc = _mm_sub_ps(aTrunc, oneMask);
1403 // select result (a or aTrunc)
1404 return ifelse(noRndReq, a, aTrunc);
1405#endif
1406}
1407
1408static SIMD_INLINE Vec<Double, 16> floor(const Vec<Double, 16> &a)
1409{
1410#ifdef __SSE4_1__
1411 return _mm_floor_pd(a);
1412#else
1413 // There is no _mm_cvtepi64_pd in SSE*, which makes the workaround
1414 // used for the float version not possible here.
1415 // Another workaround would probably be complicated and slow, so we just
1416 // floor serially.
1417 // TODO: is there a better, vectorized workaround?
1418 Double inArr[2] SIMD_ATTR_ALIGNED(16);
1419 _mm_store_pd(inArr, a);
1420 Double outArr[2] SIMD_ATTR_ALIGNED(16);
1421 outArr[0] = std::floor(inArr[0]);
1422 outArr[1] = std::floor(inArr[1]);
1423 return _mm_load_pd(outArr);
1424#endif
1425}
1426
1427static SIMD_INLINE Vec<Float, 16> round(const Vec<Float, 16> &a)
1428{
1429#ifdef __SSE4_1__
1430 // old: use _MM_SET_ROUNDING_MODE to adjust rounding direction
1431 // return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1432 // new 4. Aug 16 (rm): round to nearest, and suppress exceptions
1433 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1434#else
1435 // NOTE: only works if rounding mode is default (rnd. to nearest (even))
1436 // if e>=23, floating point number represents an integer, 2^23 = 8388608
1437 const __m128 limit = _mm_set1_ps(8388608.f);
1438 // bool mask: no rounding required if abs(a) >= limit
1439 const __m128 absA =
1440 _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
1441 const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
1442 // rounded result (here rounded according to current rounding mode)
1443 // (for |a| < limit)
1444 const __m128 aRnd = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
1445 // select result
1446 return ifelse(noRndReq, a, aRnd);
1447#endif
1448}
1449
1450static SIMD_INLINE Vec<Double, 16> round(const Vec<Double, 16> &a)
1451{
1452#ifdef __SSE4_1__
1453 return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1454#else
1455 // There is no _mm_cvtepi64_pd in SSE*, which makes the workaround
1456 // used for the float version not possible here.
1457 // Another workaround would probably be complicated and slow, so we just
1458 // round serially.
1459 // TODO: is there a better, vectorized workaround?
1460 Double inArr[2] SIMD_ATTR_ALIGNED(16);
1461 _mm_store_pd(inArr, a);
1462 Double outArr[2] SIMD_ATTR_ALIGNED(16);
1463 // std::round rounds halfway cases away from zero, hence std::rint here
1464 outArr[0] = std::rint(inArr[0]);
1465 outArr[1] = std::rint(inArr[1]);
1466 return _mm_load_pd(outArr);
1467#endif
1468}
1469
1470static SIMD_INLINE Vec<Float, 16> truncate(const Vec<Float, 16> &a)
1471{
1472#ifdef __SSE4_1__
1473 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1474#else
1475 // if e>=23, floating point number represents an integer, 2^23 = 8388608
1476 const __m128 limit = _mm_set1_ps(8388608.f);
1477 // bool mask: no rounding required if abs(a) >= limit
1478 const __m128 absA =
1479 _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
1480 const __m128 noRndReq = _mm_cmpge_ps(absA, limit);
1481 // truncated result (for |a| < limit) (cvtTps!)
1482 const __m128 aTrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
1483 // select result
1484 return ifelse(noRndReq, a, aTrunc);
1485#endif
1486}
1487
1488static SIMD_INLINE Vec<Double, 16> truncate(const Vec<Double, 16> &a)
1489{
1490#ifdef __SSE4_1__
1491 return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1492#else
1493 // There is no _mm_cvtepi64_pd in SSE*, which makes the workaround
1494 // used for the float version not possible here.
1495 // Another workaround would probably be complicated and slow, so we just
1496 // truncate serially.
1497 // TODO: is there a better, vectorized workaround?
1498 Double inArr[2] SIMD_ATTR_ALIGNED(16);
1499 _mm_store_pd(inArr, a);
1500 Double outArr[2] SIMD_ATTR_ALIGNED(16);
1501 outArr[0] = std::trunc(inArr[0]);
1502 outArr[1] = std::trunc(inArr[1]);
1503 return _mm_load_pd(outArr);
1504#endif
1505}
1506
1507// ---------------------------------------------------------------------------
1508// elementary mathematical functions
1509// ---------------------------------------------------------------------------
1510
1511// estimate of a reciprocal
1512static SIMD_INLINE Vec<Float, 16> rcp(const Vec<Float, 16> &a)
1513{
1514 return _mm_rcp_ps(a);
1515}
1516
1517// reciprocal (exact; there is no _mm_rcp_pd estimate)
1518static SIMD_INLINE Vec<Double, 16> rcp(const Vec<Double, 16> &a)
1519{
1520 // _mm_rcp_pd does not exist, use _mm_div_pd instead
1521 return _mm_div_pd(_mm_set1_pd(1.0), a);
1522}
1523
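// Illustrative sketch (not part of the library; the helper name is
// hypothetical): _mm_rcp_ps only provides roughly 12 bits of precision;
// callers needing more accuracy could refine the estimate with one
// Newton-Raphson step, x1 = x0 * (2 - a * x0).
static SIMD_INLINE Vec<Float, 16> example_rcp_refined(const Vec<Float, 16> &a)
{
  const __m128 x0 = _mm_rcp_ps(a);
  return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}
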
1524// estimate of a reciprocal square root
1525static SIMD_INLINE Vec<Float, 16> rsqrt(const Vec<Float, 16> &a)
1526{
1527 return _mm_rsqrt_ps(a);
1528}
1529
1530// reciprocal square root (exact; there is no _mm_rsqrt_pd estimate)
1531static SIMD_INLINE Vec<Double, 16> rsqrt(const Vec<Double, 16> &a)
1532{
1533 // _mm_rsqrt_pd does not exist, use _mm_div_pd and _mm_sqrt_pd instead
1534 return _mm_div_pd(_mm_set1_pd(1.0), _mm_sqrt_pd(a));
1535}
1536
1537// square root
1538static SIMD_INLINE Vec<Float, 16> sqrt(const Vec<Float, 16> &a)
1539{
1540 return _mm_sqrt_ps(a);
1541}
1542
1543// square root
1544static SIMD_INLINE Vec<Double, 16> sqrt(const Vec<Double, 16> &a)
1545{
1546 return _mm_sqrt_pd(a);
1547}
1548
1549// ---------------------------------------------------------------------------
1550// abs
1551// ---------------------------------------------------------------------------
1552
1553// 25. Mar 25 (Jonas Keller): added abs for unsigned integers
1554
1555// unsigned integers
1556template <typename T, SIMD_ENABLE_IF(std::is_unsigned<T>::value
1557 &&std::is_integral<T>::value)>
1558static SIMD_INLINE Vec<T, 16> abs(const Vec<T, 16> &a)
1559{
1560 return a;
1561}
1562
1563static SIMD_INLINE Vec<SignedByte, 16> abs(const Vec<SignedByte, 16> &a)
1564{
1565 return _mm_abs_epi8(a);
1566}
1567
1568static SIMD_INLINE Vec<Short, 16> abs(const Vec<Short, 16> &a)
1569{
1570 return _mm_abs_epi16(a);
1571}
1572
1573static SIMD_INLINE Vec<Int, 16> abs(const Vec<Int, 16> &a)
1574{
1575 return _mm_abs_epi32(a);
1576}
1577
1578static SIMD_INLINE Vec<Long, 16> abs(const Vec<Long, 16> &a)
1579{
1580 // _mm_abs_epi64 is only supported in avx512
1581 // from Hacker's Delight, 2-4 Absolute Value Function:
1582 const __m128i signMask =
1583 _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
1584 return _mm_sub_epi64(_mm_xor_si128(a, signMask), signMask);
1585}
1586
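// Scalar sketch of the sign-mask trick above (illustrative only; the helper
// name is hypothetical, and an arithmetic right shift is assumed, as with
// the vector srai): with m = x >> 63 (all 1s for negative x, 0 otherwise),
// |x| = (x ^ m) - m; like the vector version, the most negative long maps
// to itself.
static SIMD_INLINE Long example_abs_long_scalar(Long x)
{
  const Long m = x >> 63; // 0 or -1
  return (x ^ m) - m;
}
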
1587static SIMD_INLINE Vec<Float, 16> abs(const Vec<Float, 16> &a)
1588{
1589 return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
1590}
1591
1592static SIMD_INLINE Vec<Double, 16> abs(const Vec<Double, 16> &a)
1593{
1594 return _mm_and_pd(a, _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF)));
1595}
1596
1597// ---------------------------------------------------------------------------
1598// unpacklo
1599// ---------------------------------------------------------------------------
1600
1601// all integer versions
1602template <typename T>
1603static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1604 Part<0>, Bytes<1>)
1605{
1606 return _mm_unpacklo_epi8(a, b);
1607}
1608
1609// all integer versions
1610template <typename T>
1611static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1612 Part<0>, Bytes<2>)
1613{
1614 return _mm_unpacklo_epi16(a, b);
1615}
1616
1617// all integer versions
1618template <typename T>
1619static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1620 Part<0>, Bytes<4>)
1621{
1622 return _mm_unpacklo_epi32(a, b);
1623}
1624
1625// all integer versions
1626template <typename T>
1627static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1628 Part<0>, Bytes<8>)
1629{
1630 return _mm_unpacklo_epi64(a, b);
1631}
1632
1633// float version
1634static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
1635 const Vec<Float, 16> &b, Part<0>,
1636 Bytes<4>)
1637{
1638 return _mm_unpacklo_ps(a, b);
1639}
1640
1641// float version
1642static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
1643 const Vec<Float, 16> &b, Part<0>,
1644 Bytes<8>)
1645{
1646 // this moves two lower floats from a and b
1647 return _mm_movelh_ps(a, b);
1648}
1649
1650// double version
1651static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
1652 const Vec<Double, 16> &b, Part<0>,
1653 Bytes<8>)
1654{
1655 return _mm_unpacklo_pd(a, b);
1656}
1657
1658// ---------------------------------------------------------------------------
1659// unpackhi
1660// ---------------------------------------------------------------------------
1661
1662// all integer versions
1663template <typename T>
1664static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1665 Part<1>, Bytes<1>)
1666{
1667 return _mm_unpackhi_epi8(a, b);
1668}
1669
1670// all integer versions
1671template <typename T>
1672static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1673 Part<1>, Bytes<2>)
1674{
1675 return _mm_unpackhi_epi16(a, b);
1676}
1677
1678// all integer versions
1679template <typename T>
1680static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1681 Part<1>, Bytes<4>)
1682{
1683 return _mm_unpackhi_epi32(a, b);
1684}
1685
1686// all integer versions
1687template <typename T>
1688static SIMD_INLINE Vec<T, 16> unpack(const Vec<T, 16> &a, const Vec<T, 16> &b,
1689 Part<1>, Bytes<8>)
1690{
1691 return _mm_unpackhi_epi64(a, b);
1692}
1693
1694// float version
1695static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
1696 const Vec<Float, 16> &b, Part<1>,
1697 Bytes<4>)
1698{
1699 return _mm_unpackhi_ps(a, b);
1700}
1701
1702// float version
1703static SIMD_INLINE Vec<Float, 16> unpack(const Vec<Float, 16> &a,
1704 const Vec<Float, 16> &b, Part<1>,
1705 Bytes<8>)
1706{
1707 // this moves two upper floats from a and b
1708 // order b, a
1709 return _mm_movehl_ps(b, a);
1710}
1711
1712// double version
1713static SIMD_INLINE Vec<Double, 16> unpack(const Vec<Double, 16> &a,
1714 const Vec<Double, 16> &b, Part<1>,
1715 Bytes<8>)
1716{
1717 return _mm_unpackhi_pd(a, b);
1718}
1719
1720// contributed by Adam Marschall
1721
1722// 16-byte-lane oriented unpack: for 16 bytes same as generalized unpack
1723// unpack blocks of NUM_ELEMS elements of type T
1724// PART=0: low half of input vectors,
1725// PART=1: high half of input vectors
1726template <size_t PART, size_t BYTES, typename T>
1727static SIMD_INLINE Vec<T, 16> unpack16(const Vec<T, 16> &a, const Vec<T, 16> &b,
1728 Part<PART>, Bytes<BYTES>)
1729{
1730 return unpack(a, b, Part<PART>(), Bytes<BYTES>());
1731}
1732
1733// ---------------------------------------------------------------------------
1734// extract 128-bit lane as Vec<T, 16>, does nothing for 16 bytes
1735// ---------------------------------------------------------------------------
1736
1737// contributed by Adam Marschall
1738
1739template <size_t LANE_INDEX, typename T>
1740static SIMD_INLINE Vec<T, 16> extractLane(const Vec<T, 16> &a)
1741{
1742 return a;
1743}
1744
1745// ---------------------------------------------------------------------------
1746// zip (two unpacks similar to ARM NEON vzip, but for different NUM_ELEMS)
1747// ---------------------------------------------------------------------------
1748
1749// a, b are passed by-value to avoid problems with identical input/output args.
1750
1751// here we can directly map zip to unpack<PART,NUM_ELEMS,T>
1752template <size_t NUM_ELEMS, typename T>
1753static SIMD_INLINE void zip(const Vec<T, 16> a, const Vec<T, 16> b,
1754 Vec<T, 16> &l, Vec<T, 16> &h)
1755{
1756 l = unpack(a, b, Part<0>(), Bytes<NUM_ELEMS * sizeof(T)>());
1757 h = unpack(a, b, Part<1>(), Bytes<NUM_ELEMS * sizeof(T)>());
1758}
1759
1760// ---------------------------------------------------------------------------
1761// zip16 hub (16-byte-lane oriented zip): for 16 bytes same as zip
1762// ---------------------------------------------------------------------------
1763
1764// contributed by Adam Marschall
1765
1766// a, b are passed by-value to avoid problems with identical
1767// input/output args.
1768
1769template <size_t NUM_ELEMS, typename T>
1770static SIMD_INLINE void zip16(const Vec<T, 16> a, const Vec<T, 16> b,
1771 Vec<T, 16> &l, Vec<T, 16> &h)
1772{
1773 zip<NUM_ELEMS, T>(a, b, l, h);
1774}
1775
1776// ---------------------------------------------------------------------------
1777// unzip (similar to ARM NEON vuzp, but for different NUM_ELEMS)
1778// ---------------------------------------------------------------------------
1779
1780// solutions by Peter Cordes and Starvin Marvin:
1781// stackoverflow.com/q/45376193/3852630 and
1782// stackoverflow.com/a/45385216/3852630 and
1783// stackoverflow.com/q/20504618/3852630
1784
1785// all integer versions
1786template <typename T>
1787static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
1788 Vec<T, 16> &l, Vec<T, 16> &h, Bytes<1>)
1789{
1790 // the constant mask should only be set up once if unzip is used repeatedly
1791 const __m128i mask =
1792 _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
1793 const __m128i atmp = _mm_shuffle_epi8(a, mask);
1794 const __m128i btmp = _mm_shuffle_epi8(b, mask);
1795 l = _mm_unpacklo_epi64(atmp, btmp);
1796 h = _mm_unpackhi_epi64(atmp, btmp);
1797}
1798
1799// all integer versions
1800template <typename T>
1801static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
1802 Vec<T, 16> &l, Vec<T, 16> &h, Bytes<2>)
1803{
1804 // the constant mask should only be set up once if unzip is used repeatedly
1805 const __m128i mask =
1806 _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
1807 const __m128i atmp = _mm_shuffle_epi8(a, mask);
1808 const __m128i btmp = _mm_shuffle_epi8(b, mask);
1809 l = _mm_unpacklo_epi64(atmp, btmp);
1810 h = _mm_unpackhi_epi64(atmp, btmp);
1811}
1812
1813// all integer versions
1814template <typename T>
1815static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
1816 Vec<T, 16> &l, Vec<T, 16> &h, Bytes<4>)
1817{
1818 const __m128 aps = _mm_castsi128_ps(a);
1819 const __m128 bps = _mm_castsi128_ps(b);
1820 l = _mm_castps_si128(_mm_shuffle_ps(aps, bps, _MM_SHUFFLE(2, 0, 2, 0)));
1821 h = _mm_castps_si128(_mm_shuffle_ps(aps, bps, _MM_SHUFFLE(3, 1, 3, 1)));
1822}
1823
1824// all types
1825template <typename T>
1826static SIMD_INLINE void unzip(const Vec<T, 16> a, const Vec<T, 16> b,
1827 Vec<T, 16> &l, Vec<T, 16> &h, Bytes<8>)
1828{
1829 l = unpack(a, b, Part<0>(), Bytes<8>());
1830 h = unpack(a, b, Part<1>(), Bytes<8>());
1831}
1832
1833// Float
1834static SIMD_INLINE void unzip(const Vec<Float, 16> a, const Vec<Float, 16> b,
1835 Vec<Float, 16> &l, Vec<Float, 16> &h, Bytes<4>)
1836{
1837 l = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
1838 h = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
1839}
1840
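// illustrative sketch (hypothetical helper, not part of the library):
// zip followed by unzip with the same element grouping restores the inputs;
// for Short and NUM_ELEMS = 1:
//   zip:   l = [a0,b0,a1,b1,a2,b2,a3,b3], h = [a4,b4,a5,b5,a6,b6,a7,b7]
//   unzip: recovers a = [a0..a7] and b = [b0..b7]
static SIMD_INLINE void exampleZipUnzipRoundTrip(const Vec<Short, 16> &a,
                                                 const Vec<Short, 16> &b,
                                                 Vec<Short, 16> &aOut,
                                                 Vec<Short, 16> &bOut)
{
  Vec<Short, 16> l, h;
  zip<1>(a, b, l, h);                              // interleave
  unzip(l, h, aOut, bOut, Bytes<sizeof(Short)>()); // de-interleave again
}
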
1841// ---------------------------------------------------------------------------
1842// packs
1843// ---------------------------------------------------------------------------
1844
1845// signed -> signed
1846
1847static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Short, 16> &a,
1848 const Vec<Short, 16> &b,
1849 OutputType<SignedByte>)
1850{
1851 return _mm_packs_epi16(a, b);
1852}
1853
1854static SIMD_INLINE Vec<Short, 16> packs(const Vec<Int, 16> &a,
1855 const Vec<Int, 16> &b,
1856 OutputType<Short>)
1857{
1858 return _mm_packs_epi32(a, b);
1859}
1860
1861static SIMD_INLINE Vec<Short, 16> packs(const Vec<Float, 16> &a,
1862 const Vec<Float, 16> &b,
1863 OutputType<Short>)
1864{
1865 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
1866 OutputType<Short>());
1867}
1868
1869static SIMD_INLINE Vec<Float, 16> packs(const Vec<Long, 16> &a,
1870 const Vec<Long, 16> &b,
1871 OutputType<Float>)
1872{
1873 // _mm_cvtepi64_ps does not exist
1874 return _mm_shuffle_ps(_mm_cvtpd_ps(cvts(a, OutputType<Double>())),
1875 _mm_cvtpd_ps(cvts(b, OutputType<Double>())),
1876 _MM_SHUFFLE(1, 0, 1, 0));
1877}
1878
1879static SIMD_INLINE Vec<Float, 16> packs(const Vec<Double, 16> &a,
1880 const Vec<Double, 16> &b,
1881 OutputType<Float>)
1882{
1883 return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b),
1884 _MM_SHUFFLE(1, 0, 1, 0));
1885}
1886
1887// Long to Int
1888
1889static SIMD_INLINE Vec<Int, 16> packs(const Vec<Long, 16> &a,
1890 const Vec<Long, 16> &b, OutputType<Int>)
1891{
1892 // _mm_packs_epi64 does not exist
1893 // a vectorized workaround seems to be complicated, so we just use a serial
1894 // workaround here
1895 // TODO: is there a better, vectorized workaround?
1896
1897 Long input[4] SIMD_ATTR_ALIGNED(16);
1898 _mm_store_si128((__m128i *) input, a);
1899 _mm_store_si128((__m128i *) (input + 2), b);
1900 Int output[4] SIMD_ATTR_ALIGNED(16);
1901 for (int i = 0; i < 4; ++i) {
1902 output[i] =
1903 (Int) std::min(std::max(input[i], (Long) std::numeric_limits<Int>::min()),
1904 (Long) std::numeric_limits<Int>::max());
1905 }
1906 return _mm_load_si128((__m128i *) output);
1907}
1908
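// worked example of the saturating Long -> Int pack above (illustration only):
//   a = [ 3000000000, -5000000000 ], b = [ 7, -8 ]
//   -> [ 2147483647, -2147483648, 7, -8 ]
// i.e. values outside the Int range are clamped to the Int limits
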
1909// Double to Int
1910
1911static SIMD_INLINE Vec<Int, 16> packs(const Vec<Double, 16> &a,
1912 const Vec<Double, 16> &b, OutputType<Int>)
1913{
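  // note: only the upper bound needs explicit clipping; for inputs below the
  // Int range (and for NaN) _mm_cvtpd_epi32 already returns 0x80000000, which
  // equals the saturated minimum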
1914 const __m128d clip = _mm_set1_pd(std::numeric_limits<Int>::max());
1915 const __m128 bI = _mm_castsi128_ps(_mm_cvtpd_epi32(_mm_min_pd(clip, b)));
1916 const __m128 aI = _mm_castsi128_ps(_mm_cvtpd_epi32(_mm_min_pd(clip, a)));
1917 return _mm_castps_si128(_mm_shuffle_ps(aI, bI, _MM_SHUFFLE(1, 0, 1, 0)));
1918}
1919
1920// unsigned -> unsigned
1921
1922static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Word, 16> &a,
1923 const Vec<Word, 16> &b, OutputType<Byte>)
1924{
1925 // _mm_packus_epu16 does not exist, so saturate inputs to byte range and then
1926 // use _mm_packus_epi16
1927 return _mm_packus_epi16(min(a, Vec<Word, 16>(_mm_set1_epi16(0xff))),
1928 min(b, Vec<Word, 16>(_mm_set1_epi16(0xff))));
1929}
1930
1931// signed -> unsigned
1932
1933static SIMD_INLINE Vec<Byte, 16> packs(const Vec<Short, 16> &a,
1934 const Vec<Short, 16> &b,
1935 OutputType<Byte>)
1936{
1937 return _mm_packus_epi16(a, b);
1938}
1939
1940static SIMD_INLINE Vec<Word, 16> packs(const Vec<Int, 16> &a,
1941 const Vec<Int, 16> &b, OutputType<Word>)
1942{
1943#ifdef __SSE4_1__
1944 return _mm_packus_epi32(a, b);
1945#else
1946 // mask for lower 16 bit
1947 const __m128i mask = _mm_set1_epi32(0x0000ffff);
1948 // a >= 0 ? asat = a : asat = 0
1949 // 23. Nov 17 (rm): 32->31
1950 __m128i asat = _mm_andnot_si128(_mm_srai_epi32(a, 31), a);
1951 // cmp/or is used to restrict number to 16 bit
1952 // srai/slli is used for sign extension of 16 bit number,
1953 // makes signed saturation (in packs) a no-op, see
1954 // http://stackoverflow.com/questions/12118910/
1955 // converting-float-vector-to-16-bit-int-without-saturating
1956 // e.g.
1957 // a = 0xffffffff (-1) -> asat = 0x00000000
1958 // -> cmpgt = 0x00000000
1959 // -> slli = 0x00000000
1960 // -> or = 0x00000000
1961 // -> srai = 0x00000000
1962 // -> packs = 0x0000
1963 // a = 0x7fffffff (>=0) -> asat = 0x7fffffff
1964 // -> cmpgt = 0xffffffff
1965 // -> slli = 0xffff0000
1966 // -> or = 0xffffffff
1967 // -> srai = 0xffffffff
1968 // -> packs = 0xffff
1969 // a = 0x0000ffff (>=0) -> asat = 0x0000ffff
1970 // -> cmpgt = 0x00000000
1971 // -> slli = 0xffff0000
1972 // -> or = 0xffff0000
1973 // -> srai = 0xffffffff
1974 // -> packs = 0xffff
1975 // a = 0x0000fffe (>=0) -> asat = 0x0000fffe
1976 // -> cmpgt = 0x00000000
1977 // -> slli = 0xfffe0000
1978 // -> or = 0xfffe0000
1979 // -> srai = 0xfffffffe
1980 // -> packs = 0xfffe
1981 // a = 0x00007fff (>=0) -> asat = 0x00007fff
1982 // -> cmpgt = 0x00000000
1983 // -> slli = 0x7fff0000
1984 // -> or = 0x7fff0000
1985 // -> srai = 0x00007fff
1986 // -> packs = 0x7fff
1987 asat = _mm_srai_epi32(
1988 _mm_or_si128(_mm_slli_epi32(asat, 16), _mm_cmpgt_epi32(asat, mask)), 16);
1989 // same for b
1990 // 23. Nov 17 (rm): 32->31
1991 __m128i bsat = _mm_andnot_si128(_mm_srai_epi32(b, 31), b);
1992 bsat = _mm_srai_epi32(
1993 _mm_or_si128(_mm_slli_epi32(bsat, 16), _mm_cmpgt_epi32(bsat, mask)), 16);
1994 return _mm_packs_epi32(asat, bsat);
1995#endif
1996}
1997
1998static SIMD_INLINE Vec<Word, 16> packs(const Vec<Float, 16> &a,
1999 const Vec<Float, 16> &b,
2000 OutputType<Word>)
2001{
2002 return packs(cvts(a, OutputType<Int>()), cvts(b, OutputType<Int>()),
2003 OutputType<Word>());
2004}
2005
2006// unsigned -> signed
2007
2008static SIMD_INLINE Vec<SignedByte, 16> packs(const Vec<Word, 16> &a,
2009 const Vec<Word, 16> &b,
2010 OutputType<SignedByte>)
2011{
2012 // _mm_packs_epu16 does not exist, so saturate inputs to signed byte range and
2013 // then use _mm_packs_epi16
2014 return _mm_packs_epi16(min(a, Vec<Word, 16>(_mm_set1_epi16(0x7f))),
2015 min(b, Vec<Word, 16>(_mm_set1_epi16(0x7f))));
2016}
2017
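// illustrative usage sketch (hypothetical helper, not part of the library):
// narrowing two Short vectors into one SignedByte vector with signed
// saturation; elements of a fill the lower half, elements of b the upper half
// (e.g. 300 -> 127, -300 -> -128)
static SIMD_INLINE Vec<SignedByte, 16> examplePackShortPair(
  const Vec<Short, 16> &a, const Vec<Short, 16> &b)
{
  return packs(a, b, OutputType<SignedByte>());
}
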
2018// ---------------------------------------------------------------------------
2019// generalized extend: no stage
2020// ---------------------------------------------------------------------------
2021
2022// combinations:
2023// - signed -> extended signed (sign extension)
2024// - unsigned -> extended unsigned (zero extension)
2025// - unsigned -> extended signed (zero extension)
2026// - signed -> extended unsigned (saturation and zero extension)
2027
2028// same types
2029template <typename T>
2030static SIMD_INLINE void extend(const Vec<T, 16> &vIn, Vec<T, 16> vOut[1])
2031{
2032 vOut[0] = vIn;
2033}
2034
2035// same size, different types
2036
2037static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2038 Vec<Byte, 16> vOut[1])
2039{
2040 vOut[0] = max(vIn, _mm_setzero_si128());
2041}
2042
2043static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
2044 Vec<SignedByte, 16> vOut[1])
2045{
2046 vOut[0] = min(vIn, _mm_set1_epi8(0x7f));
2047}
2048
2049static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Word, 16> vOut[1])
2050{
2051 vOut[0] = _mm_max_epi16(vIn, _mm_setzero_si128());
2052}
2053
2054static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Short, 16> vOut[1])
2055{
2056 vOut[0] = min(vIn, _mm_set1_epi16(0x7fff));
2057}
2058
2059// ---------------------------------------------------------------------------
2060// generalized extend: single stage
2061// ---------------------------------------------------------------------------
2062
2063// signed -> signed
2064
2065static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2066 Vec<Short, 16> vOut[2])
2067{
2068#ifdef __SSE4_1__
2069 vOut[0] = _mm_cvtepi8_epi16(vIn);
2070 vOut[1] = _mm_cvtepi8_epi16(_mm_srli_si128(vIn, 8));
2071#else
2072 vOut[0] = _mm_srai_epi16(_mm_unpacklo_epi8(_mm_undefined_si128(), vIn), 8);
2073 vOut[1] = _mm_srai_epi16(_mm_unpackhi_epi8(_mm_undefined_si128(), vIn), 8);
2074#endif
2075}
2076
2077static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Int, 16> vOut[2])
2078{
2079#ifdef __SSE4_1__
2080 vOut[0] = _mm_cvtepi16_epi32(vIn);
2081 vOut[1] = _mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8));
2082#else
2083 vOut[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_undefined_si128(), vIn), 16);
2084 vOut[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_undefined_si128(), vIn), 16);
2085#endif
2086}
2087
2088static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
2089 Vec<Float, 16> vOut[2])
2090{
2091#ifdef __SSE4_1__
2092 vOut[0] = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(vIn));
2093 vOut[1] = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8)));
2094#else
2095 vOut[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(vIn, vIn), 16));
2096 vOut[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(vIn, vIn), 16));
2097#endif
2098}
2099
2100// unsigned -> unsigned
2101
2102static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Word, 16> vOut[2])
2103{
2104 // there's no _mm_cvtepu8_epu16()
2105 vOut[0] = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2106 vOut[1] = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2107}
2108
2109// unsigned -> signed
2110
2111static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Short, 16> vOut[2])
2112{
2113#ifdef __SSE4_1__
2114 vOut[0] = _mm_cvtepu8_epi16(vIn);
2115 vOut[1] = _mm_cvtepu8_epi16(_mm_srli_si128(vIn, 8));
2116#else
2117 vOut[0] = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2118 vOut[1] = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2119#endif
2120}
2121
2122static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Int, 16> vOut[2])
2123{
2124#ifdef __SSE4_1__
2125 vOut[0] = _mm_cvtepu16_epi32(vIn);
2126 vOut[1] = _mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8));
2127#else
2128 vOut[0] = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
2129 vOut[1] = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
2130#endif
2131}
2132
2133static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Float, 16> vOut[2])
2134{
2135#ifdef __SSE4_1__
2136 vOut[0] = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(vIn));
2137 vOut[1] = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8)));
2138#else
2139 vOut[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(vIn, _mm_setzero_si128()));
2140 vOut[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(vIn, _mm_setzero_si128()));
2141#endif
2142}
2143
2144static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Long, 16> vOut[2])
2145{
2146#ifdef __SSE4_1__
2147 vOut[0] = _mm_cvtepi32_epi64(vIn);
2148 vOut[1] = _mm_cvtepi32_epi64(_mm_srli_si128(vIn, 8));
2149#else
2150 const __m128i sign = _mm_srai_epi32(vIn, 31);
2151 vOut[0] = _mm_unpacklo_epi32(vIn, sign);
2152 vOut[1] = _mm_unpackhi_epi32(vIn, sign);
2153#endif
2154}
2155
2156static SIMD_INLINE void extend(const Vec<Int, 16> &vIn, Vec<Double, 16> vOut[2])
2157{
2158 vOut[0] = _mm_cvtepi32_pd(vIn);
2159 vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(vIn, 8));
2160}
2161
2162static SIMD_INLINE void extend(const Vec<Float, 16> &vIn, Vec<Long, 16> vOut[2])
2163{
2164 const auto clipped =
2165 _mm_min_ps(vIn, _mm_set1_ps(MAX_POS_FLOAT_CONVERTIBLE_TO_INT64));
2166 vOut[0] = cvts(_mm_cvtps_pd(clipped), OutputType<Long>());
2167 vOut[1] = cvts(_mm_cvtps_pd(_mm_castsi128_ps(
2168 _mm_srli_si128(_mm_castps_si128(clipped), 8))),
2169 OutputType<Long>());
2170}
2171
2172static SIMD_INLINE void extend(const Vec<Float, 16> &vIn,
2173 Vec<Double, 16> vOut[2])
2174{
2175 vOut[0] = _mm_cvtps_pd(vIn);
2176 vOut[1] =
2177 _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(vIn), 8)));
2178}
2179
2180// signed -> unsigned
2181
2182static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2183 Vec<Word, 16> vOut[2])
2184{
2185 // bring input into positive range
2186#ifdef __SSE4_1__
2187 const __m128i vInPos = _mm_max_epi8(vIn, _mm_setzero_si128());
2188#else
2189 // from Agner Fog's VCL vectori128.h
2190 const __m128i signbit = _mm_set1_epi32(0x80808080);
2191 const __m128i a1 = _mm_xor_si128(vIn, signbit); // add 0x80
2192 const __m128i m1 = _mm_max_epu8(a1, signbit); // unsigned max
2193 const __m128i vInPos = _mm_xor_si128(m1, signbit); // sub 0x80
2194#endif
2195 vOut[0] = _mm_unpacklo_epi8(vInPos, _mm_setzero_si128());
2196 vOut[1] = _mm_unpackhi_epi8(vInPos, _mm_setzero_si128());
2197}
2198
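// illustrative usage sketch (hypothetical helper, not part of the library):
// a single-stage extend widens one input vector into
// sizeof(Tout) / sizeof(Tin) output vectors, e.g. 16 Byte elements into
// 2 x 8 Short elements (zero-extended)
static SIMD_INLINE void exampleWidenByteToShort(const Vec<Byte, 16> &vIn,
                                                Vec<Short, 16> vOut[2])
{
  extend(vIn, vOut); // vOut[0]: lower 8 bytes, vOut[1]: upper 8 bytes
}
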
2199// ---------------------------------------------------------------------------
2200// generalized extend: two stages
2201// ---------------------------------------------------------------------------
2202
2203// signed -> signed
2204
2205static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2206 Vec<Int, 16> vOut[4])
2207{
2208#ifdef __SSE4_1__
2209 vOut[0] = _mm_cvtepi8_epi32(vIn);
2210 vOut[1] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4));
2211 vOut[2] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8));
2212 vOut[3] = _mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12));
2213#else
2214 const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
2215 const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
2216 const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
2217 const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
2218 const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
2219 const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
2220 vOut[0] = _mm_srai_epi32(lolo16, 24);
2221 vOut[1] = _mm_srai_epi32(lohi16, 24);
2222 vOut[2] = _mm_srai_epi32(hilo16, 24);
2223 vOut[3] = _mm_srai_epi32(hihi16, 24);
2224#endif
2225}
2226
2227static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2228 Vec<Float, 16> vOut[4])
2229{
2230#ifdef __SSE4_1__
2231 vOut[0] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(vIn));
2232 vOut[1] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4)));
2233 vOut[2] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8)));
2234 vOut[3] = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12)));
2235#else
2236 const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
2237 const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
2238 const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
2239 const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
2240 const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
2241 const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
2242 vOut[0] = _mm_cvtepi32_ps(_mm_srai_epi32(lolo16, 24));
2243 vOut[1] = _mm_cvtepi32_ps(_mm_srai_epi32(lohi16, 24));
2244 vOut[2] = _mm_cvtepi32_ps(_mm_srai_epi32(hilo16, 24));
2245 vOut[3] = _mm_cvtepi32_ps(_mm_srai_epi32(hihi16, 24));
2246#endif
2247}
2248
2249static SIMD_INLINE void extend(const Vec<Short, 16> &vIn, Vec<Long, 16> vOut[4])
2250{
2251#ifdef __SSE4_1__
2252 vOut[0] = _mm_cvtepi16_epi64(vIn);
2253 vOut[1] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 4));
2254 vOut[2] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 8));
2255 vOut[3] = _mm_cvtepi16_epi64(_mm_srli_si128(vIn, 12));
2256#else
2257 const __m128i lo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), vIn);
2258 const __m128i hi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), vIn);
2259 const __m128i lo16ext = _mm_srai_epi32(lo16, 16);
2260 const __m128i hi16ext = _mm_srai_epi32(hi16, 16);
2261 const __m128i lo16sign = _mm_srai_epi32(lo16, 31);
2262 const __m128i hi16sign = _mm_srai_epi32(hi16, 31);
2263 vOut[0] = _mm_unpacklo_epi32(lo16ext, lo16sign);
2264 vOut[1] = _mm_unpackhi_epi32(lo16ext, lo16sign);
2265 vOut[2] = _mm_unpacklo_epi32(hi16ext, hi16sign);
2266 vOut[3] = _mm_unpackhi_epi32(hi16ext, hi16sign);
2267#endif
2268}
2269
2270static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
2271 Vec<Double, 16> vOut[4])
2272{
2273#ifdef __SSE4_1__
2274 vOut[0] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(vIn));
2275 vOut[1] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 4)));
2276 vOut[2] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 8)));
2277 vOut[3] = _mm_cvtepi32_pd(_mm_cvtepi16_epi32(_mm_srli_si128(vIn, 12)));
2278#else
2279 const __m128i lo16 =
2280 _mm_srai_epi32(_mm_unpacklo_epi16(_mm_undefined_si128(), vIn), 16);
2281 const __m128i hi16 =
2282 _mm_srai_epi32(_mm_unpackhi_epi16(_mm_undefined_si128(), vIn), 16);
2283 vOut[0] = _mm_cvtepi32_pd(lo16);
2284 vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lo16, 8));
2285 vOut[2] = _mm_cvtepi32_pd(hi16);
2286 vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(hi16, 8));
2287#endif
2288}
2289
2290// unsigned -> signed
2291
2292static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Int, 16> vOut[4])
2293{
2294#ifdef __SSE4_1__
2295 vOut[0] = _mm_cvtepu8_epi32(vIn);
2296 vOut[1] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4));
2297 vOut[2] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8));
2298 vOut[3] = _mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12));
2299#else
2300 const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2301 const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2302 vOut[0] = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
2303 vOut[1] = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
2304 vOut[2] = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
2305 vOut[3] = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
2306#endif
2307}
2308
2309static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Float, 16> vOut[4])
2310{
2311#ifdef __SSE4_1__
2312 vOut[0] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(vIn));
2313 vOut[1] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4)));
2314 vOut[2] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8)));
2315 vOut[3] = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12)));
2316#else
2317 const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2318 const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2319 vOut[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo8, _mm_setzero_si128()));
2320 vOut[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo8, _mm_setzero_si128()));
2321 vOut[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi8, _mm_setzero_si128()));
2322 vOut[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi8, _mm_setzero_si128()));
2323#endif
2324}
2325
2326static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Long, 16> vOut[4])
2327{
2328#ifdef __SSE4_1__
2329 vOut[0] = _mm_cvtepu16_epi64(vIn);
2330 vOut[1] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 4));
2331 vOut[2] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 8));
2332 vOut[3] = _mm_cvtepu16_epi64(_mm_srli_si128(vIn, 12));
2333#else
2334 const __m128i lo16 = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
2335 const __m128i hi16 = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
2336 vOut[0] = _mm_unpacklo_epi32(lo16, _mm_setzero_si128());
2337 vOut[1] = _mm_unpackhi_epi32(lo16, _mm_setzero_si128());
2338 vOut[2] = _mm_unpacklo_epi32(hi16, _mm_setzero_si128());
2339 vOut[3] = _mm_unpackhi_epi32(hi16, _mm_setzero_si128());
2340#endif
2341}
2342
2343static SIMD_INLINE void extend(const Vec<Word, 16> &vIn,
2344 Vec<Double, 16> vOut[4])
2345{
2346#ifdef __SSE4_1__
2347 vOut[0] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(vIn));
2348 vOut[1] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 4)));
2349 vOut[2] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 8)));
2350 vOut[3] = _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_srli_si128(vIn, 12)));
2351#else
2352 const __m128i lo16 = _mm_unpacklo_epi16(vIn, _mm_setzero_si128());
2353 const __m128i hi16 = _mm_unpackhi_epi16(vIn, _mm_setzero_si128());
2354 vOut[0] = _mm_cvtepi32_pd(lo16);
2355 vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lo16, 8));
2356 vOut[2] = _mm_cvtepi32_pd(hi16);
2357 vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(hi16, 8));
2358#endif
2359}
2360
2361// ---------------------------------------------------------------------------
2362// generalized extend: three stages
2363// ---------------------------------------------------------------------------
2364
2365// signed -> signed
2366
2367static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2368 Vec<Long, 16> vOut[8])
2369{
2370#ifdef __SSE4_1__
2371 vOut[0] = _mm_cvtepi8_epi64(vIn);
2372 vOut[1] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 2));
2373 vOut[2] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 4));
2374 vOut[3] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 6));
2375 vOut[4] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 8));
2376 vOut[5] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 10));
2377 vOut[6] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 12));
2378 vOut[7] = _mm_cvtepi8_epi64(_mm_srli_si128(vIn, 14));
2379#else
2380 const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
2381 const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
2382 const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
2383 const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
2384 const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
2385 const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
2386 const __m128i lolo16ext = _mm_srai_epi32(lolo16, 24);
2387 const __m128i lohi16ext = _mm_srai_epi32(lohi16, 24);
2388 const __m128i hilo16ext = _mm_srai_epi32(hilo16, 24);
2389 const __m128i hihi16ext = _mm_srai_epi32(hihi16, 24);
2390 const __m128i lolo16sign = _mm_srai_epi32(lolo16, 31);
2391 const __m128i lohi16sign = _mm_srai_epi32(lohi16, 31);
2392 const __m128i hilo16sign = _mm_srai_epi32(hilo16, 31);
2393 const __m128i hihi16sign = _mm_srai_epi32(hihi16, 31);
2394 vOut[0] = _mm_unpacklo_epi32(lolo16ext, lolo16sign);
2395 vOut[1] = _mm_unpackhi_epi32(lolo16ext, lolo16sign);
2396 vOut[2] = _mm_unpacklo_epi32(lohi16ext, lohi16sign);
2397 vOut[3] = _mm_unpackhi_epi32(lohi16ext, lohi16sign);
2398 vOut[4] = _mm_unpacklo_epi32(hilo16ext, hilo16sign);
2399 vOut[5] = _mm_unpackhi_epi32(hilo16ext, hilo16sign);
2400 vOut[6] = _mm_unpacklo_epi32(hihi16ext, hihi16sign);
2401 vOut[7] = _mm_unpackhi_epi32(hihi16ext, hihi16sign);
2402#endif
2403}
2404
2405static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
2406 Vec<Double, 16> vOut[8])
2407{
2408#ifdef __SSE4_1__
2409 vOut[0] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(vIn));
2410 vOut[1] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 2)));
2411 vOut[2] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 4)));
2412 vOut[3] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 6)));
2413 vOut[4] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 8)));
2414 vOut[5] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 10)));
2415 vOut[6] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 12)));
2416 vOut[7] = _mm_cvtepi32_pd(_mm_cvtepi8_epi32(_mm_srli_si128(vIn, 14)));
2417#else
2418 const __m128i lo8 = _mm_unpacklo_epi8(_mm_undefined_si128(), vIn);
2419 const __m128i hi8 = _mm_unpackhi_epi8(_mm_undefined_si128(), vIn);
2420 const __m128i lolo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), lo8);
2421 const __m128i lohi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), lo8);
2422 const __m128i hilo16 = _mm_unpacklo_epi16(_mm_undefined_si128(), hi8);
2423 const __m128i hihi16 = _mm_unpackhi_epi16(_mm_undefined_si128(), hi8);
2424 const __m128i lolo16ext = _mm_srai_epi32(lolo16, 24);
2425 const __m128i lohi16ext = _mm_srai_epi32(lohi16, 24);
2426 const __m128i hilo16ext = _mm_srai_epi32(hilo16, 24);
2427 const __m128i hihi16ext = _mm_srai_epi32(hihi16, 24);
2428 vOut[0] = _mm_cvtepi32_pd(lolo16ext);
2429 vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lolo16ext, 8));
2430 vOut[2] = _mm_cvtepi32_pd(lohi16ext);
2431 vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(lohi16ext, 8));
2432 vOut[4] = _mm_cvtepi32_pd(hilo16ext);
2433 vOut[5] = _mm_cvtepi32_pd(_mm_srli_si128(hilo16ext, 8));
2434 vOut[6] = _mm_cvtepi32_pd(hihi16ext);
2435 vOut[7] = _mm_cvtepi32_pd(_mm_srli_si128(hihi16ext, 8));
2436#endif
2437}
2438
2439// unsigned -> signed
2440
2441static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Long, 16> vOut[8])
2442{
2443#ifdef __SSE4_1__
2444 vOut[0] = _mm_cvtepu8_epi64(vIn);
2445 vOut[1] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 2));
2446 vOut[2] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 4));
2447 vOut[3] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 6));
2448 vOut[4] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 8));
2449 vOut[5] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 10));
2450 vOut[6] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 12));
2451 vOut[7] = _mm_cvtepu8_epi64(_mm_srli_si128(vIn, 14));
2452#else
2453 const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2454 const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2455 const __m128i lolo16 = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
2456 const __m128i lohi16 = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
2457 const __m128i hilo16 = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
2458 const __m128i hihi16 = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
2459 vOut[0] = _mm_unpacklo_epi32(lolo16, _mm_setzero_si128());
2460 vOut[1] = _mm_unpackhi_epi32(lolo16, _mm_setzero_si128());
2461 vOut[2] = _mm_unpacklo_epi32(lohi16, _mm_setzero_si128());
2462 vOut[3] = _mm_unpackhi_epi32(lohi16, _mm_setzero_si128());
2463 vOut[4] = _mm_unpacklo_epi32(hilo16, _mm_setzero_si128());
2464 vOut[5] = _mm_unpackhi_epi32(hilo16, _mm_setzero_si128());
2465 vOut[6] = _mm_unpacklo_epi32(hihi16, _mm_setzero_si128());
2466 vOut[7] = _mm_unpackhi_epi32(hihi16, _mm_setzero_si128());
2467#endif
2468}
2469
2470static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
2471 Vec<Double, 16> vOut[8])
2472{
2473#ifdef __SSE4_1__
2474 vOut[0] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(vIn));
2475 vOut[1] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 2)));
2476 vOut[2] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 4)));
2477 vOut[3] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 6)));
2478 vOut[4] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 8)));
2479 vOut[5] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 10)));
2480 vOut[6] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 12)));
2481 vOut[7] = _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_srli_si128(vIn, 14)));
2482#else
2483 const __m128i lo8 = _mm_unpacklo_epi8(vIn, _mm_setzero_si128());
2484 const __m128i hi8 = _mm_unpackhi_epi8(vIn, _mm_setzero_si128());
2485 const __m128i lolo16 = _mm_unpacklo_epi16(lo8, _mm_setzero_si128());
2486 const __m128i lohi16 = _mm_unpackhi_epi16(lo8, _mm_setzero_si128());
2487 const __m128i hilo16 = _mm_unpacklo_epi16(hi8, _mm_setzero_si128());
2488 const __m128i hihi16 = _mm_unpackhi_epi16(hi8, _mm_setzero_si128());
2489 vOut[0] = _mm_cvtepi32_pd(lolo16);
2490 vOut[1] = _mm_cvtepi32_pd(_mm_srli_si128(lolo16, 8));
2491 vOut[2] = _mm_cvtepi32_pd(lohi16);
2492 vOut[3] = _mm_cvtepi32_pd(_mm_srli_si128(lohi16, 8));
2493 vOut[4] = _mm_cvtepi32_pd(hilo16);
2494 vOut[5] = _mm_cvtepi32_pd(_mm_srli_si128(hilo16, 8));
2495 vOut[6] = _mm_cvtepi32_pd(hihi16);
2496 vOut[7] = _mm_cvtepi32_pd(_mm_srli_si128(hihi16, 8));
2497#endif
2498}
2499
2500// ---------------------------------------------------------------------------
2501// generalized extend: special case int <-> float, long <-> double
2502// ---------------------------------------------------------------------------
2503
2504template <typename Tout, typename Tin,
2505 SIMD_ENABLE_IF(sizeof(Tin) == sizeof(Tout) &&
2506 std::is_floating_point<Tin>::value !=
2507 std::is_floating_point<Tout>::value)>
2508static SIMD_INLINE void extend(const Vec<Tin, 16> &vIn, Vec<Tout, 16> vOut[1])
2509{
2510 vOut[0] = cvts(vIn, OutputType<Tout>());
2511}
2512
2513// ---------------------------------------------------------------------------
2514// srai
2515// ---------------------------------------------------------------------------
2516
2517// 16. Oct 22 (Jonas Keller): added missing Byte and SignedByte versions
2518
2519template <size_t COUNT>
2520static SIMD_INLINE Vec<Byte, 16> srai(const Vec<Byte, 16> &a)
2521{
2522 SIMD_IF_CONSTEXPR (COUNT < 8) {
2523 const __m128i odd = _mm_srai_epi16(a, COUNT);
2524 const __m128i even = _mm_srai_epi16(_mm_slli_epi16(a, 8), COUNT + 8);
2525 const __m128i odd_masked =
2526 _mm_and_si128(odd, _mm_set1_epi16((int16_t) 0xFF00));
2527 const __m128i even_masked = _mm_and_si128(even, _mm_set1_epi16(0x00FF));
2528 return _mm_or_si128(odd_masked, even_masked);
2529 } else {
2530 // result should be all ones if a is negative, all zeros otherwise
2531 return _mm_cmplt_epi8(a, _mm_setzero_si128());
2532 }
2533}
2534
2535template <size_t COUNT>
2536static SIMD_INLINE Vec<SignedByte, 16> srai(const Vec<SignedByte, 16> &a)
2537{
2538 return reinterpret(srai<COUNT>(reinterpret(a, OutputType<Byte>())),
2539 OutputType<SignedByte>());
2540}
2541
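// worked example of the 16-bit emulation above (illustration only), COUNT = 2,
// one 16-bit lane holding the bytes [0x84, 0x7f] (-124, 127):
//   odd  = srai_epi16(0x7f84, 2)     = 0x1fe1 -> upper byte 0x1f (=  31)
//   even = srai_epi16(0x8400, 2 + 8) = 0xffe1 -> lower byte 0xe1 (= -31)
//   merged result bytes: [0xe1, 0x1f] = [-31, 31], the per-byte arithmetic
//   shift of the inputs
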
2542template <size_t COUNT>
2543static SIMD_INLINE Vec<Word, 16> srai(const Vec<Word, 16> &a)
2544{
2545 return _mm_srai_epi16(a, vec::min(COUNT, 15ul));
2546}
2547
2548template <size_t COUNT>
2549static SIMD_INLINE Vec<Short, 16> srai(const Vec<Short, 16> &a)
2550{
2551 return _mm_srai_epi16(a, vec::min(COUNT, 15ul));
2552}
2553
2554template <size_t COUNT>
2555static SIMD_INLINE Vec<Int, 16> srai(const Vec<Int, 16> &a)
2556{
2557 return _mm_srai_epi32(a, vec::min(COUNT, 31ul));
2558}
2559
2560template <size_t COUNT>
2561static SIMD_INLINE Vec<Long, 16> srai(const Vec<Long, 16> &a)
2562{
2563 // _mm_srai_epi64 is not available
2564 // workaround from Hacker's Delight, 2–17 Double-Length Shifts, Shift right
2565 // double signed:
2566 const __m128i odd = _mm_srai_epi32(a, vec::min(COUNT, 31ul));
2567 __m128i even;
2568 SIMD_IF_CONSTEXPR (COUNT < 32) {
2569 even = _mm_or_si128(_mm_srli_epi32(a, COUNT),
2570 _mm_slli_epi32(_mm_srli_si128(a, 4), 32 - COUNT));
2571 } else {
2572 even = _mm_srai_epi32(_mm_srli_si128(a, 4), vec::min(COUNT - 32, 31ul));
2573 }
2574#ifdef __SSE4_1__
2575 return _mm_blend_epi16(even, odd, 0xcc);
2576#else
2577 return _mm_or_si128(_mm_and_si128(even, _mm_set1_epi64x(0x00000000FFFFFFFF)),
2578 _mm_and_si128(odd, _mm_set1_epi64x(0xFFFFFFFF00000000)));
2579#endif
2580}
2581
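// worked example of the double-length shift above (illustration only),
// COUNT = 4, one 64-bit lane a = 0xFFFFFFFF00000010:
//   odd  (upper dword, arithmetic): 0xFFFFFFFF >> 4              = 0xFFFFFFFF
//   even (lower dword, logical, refilled from the upper dword):
//         (0x00000010 >> 4) | (0xFFFFFFFF << 28)                 = 0xF0000001
//   recombined 64-bit result: 0xFFFFFFFFF0000001, i.e. the arithmetic right
//   shift of the full 64-bit value by 4
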
2582// ---------------------------------------------------------------------------
2583// srli
2584// ---------------------------------------------------------------------------
2585
2586// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2587// License: not specified
2588template <size_t COUNT>
2589static SIMD_INLINE Vec<Byte, 16> srli(const Vec<Byte, 16> &a)
2590{
2591 SIMD_IF_CONSTEXPR (COUNT < 8) {
2592 return _mm_and_si128(_mm_set1_epi8((int8_t) (0xff >> COUNT)),
2593 _mm_srli_epi32(a, COUNT));
2594 } else {
2595 return _mm_setzero_si128();
2596 }
2597}
2598
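// worked example of the byte emulation above (illustration only), COUNT = 3,
// two adjacent input bytes [0x80, 0xff]:
//   _mm_srli_epi32 shifts across byte boundaries: ...ff80 >> 3 = ...1ff0,
//   i.e. the bytes become [0xf0, 0x1f] with bits leaking in from the neighbor;
//   masking each byte with (0xff >> 3) = 0x1f yields [0x10, 0x1f], the correct
//   per-byte logical shift (0x80 >> 3, 0xff >> 3)
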
2599// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2600// License: not specified
2601template <size_t COUNT>
2602static SIMD_INLINE Vec<SignedByte, 16> srli(const Vec<SignedByte, 16> &a)
2603{
2604 SIMD_IF_CONSTEXPR (COUNT < 8) {
2605 return _mm_and_si128(_mm_set1_epi8((int8_t) (0xff >> COUNT)),
2606 _mm_srli_epi32(a, COUNT));
2607 } else {
2608 return _mm_setzero_si128();
2609 }
2610}
2611
2612template <size_t COUNT>
2613static SIMD_INLINE Vec<Word, 16> srli(const Vec<Word, 16> &a)
2614{
2615 SIMD_IF_CONSTEXPR (COUNT < 16) {
2616 return _mm_srli_epi16(a, COUNT);
2617 } else {
2618 return _mm_setzero_si128();
2619 }
2620}
2621
2622template <size_t COUNT>
2623static SIMD_INLINE Vec<Short, 16> srli(const Vec<Short, 16> &a)
2624{
2625 SIMD_IF_CONSTEXPR (COUNT < 16) {
2626 return _mm_srli_epi16(a, COUNT);
2627 } else {
2628 return _mm_setzero_si128();
2629 }
2630}
2631
2632template <size_t COUNT>
2633static SIMD_INLINE Vec<Int, 16> srli(const Vec<Int, 16> &a)
2634{
2635 SIMD_IF_CONSTEXPR (COUNT < 32) {
2636 return _mm_srli_epi32(a, COUNT);
2637 } else {
2638 return _mm_setzero_si128();
2639 }
2640}
2641
2642template <size_t COUNT>
2643static SIMD_INLINE Vec<Long, 16> srli(const Vec<Long, 16> &a)
2644{
2645 SIMD_IF_CONSTEXPR (COUNT < 64) {
2646 return _mm_srli_epi64(a, COUNT);
2647 } else {
2648 return _mm_setzero_si128();
2649 }
2650}
2651
2652// ---------------------------------------------------------------------------
2653// slli
2654// ---------------------------------------------------------------------------
2655
2656// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2657// License: not specified
2658template <size_t COUNT>
2659static SIMD_INLINE Vec<Byte, 16> slli(const Vec<Byte, 16> &a)
2660{
2661 SIMD_IF_CONSTEXPR (COUNT < 8) {
2662 return _mm_and_si128(
2663 _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2664 _mm_slli_epi32(a, COUNT));
2665 } else {
2666 return _mm_setzero_si128();
2667 }
2668}
2669
2670// https://github.com/grumpos/spu_intrin/blob/master/src/sse_extensions.h
2671// License: not specified
2672template <size_t COUNT>
2673static SIMD_INLINE Vec<SignedByte, 16> slli(const Vec<SignedByte, 16> &a)
2674{
2675 SIMD_IF_CONSTEXPR (COUNT < 8) {
2676 return _mm_and_si128(
2677 _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << COUNT))),
2678 _mm_slli_epi32(a, COUNT));
2679 } else {
2680 return _mm_setzero_si128();
2681 }
2682}
2683
2684template <size_t COUNT>
2685static SIMD_INLINE Vec<Word, 16> slli(const Vec<Word, 16> &a)
2686{
2687 SIMD_IF_CONSTEXPR (COUNT < 16) {
2688 return _mm_slli_epi16(a, COUNT);
2689 } else {
2690 return _mm_setzero_si128();
2691 }
2692}
2693
2694template <size_t COUNT>
2695static SIMD_INLINE Vec<Short, 16> slli(const Vec<Short, 16> &a)
2696{
2697 SIMD_IF_CONSTEXPR (COUNT < 16) {
2698 return _mm_slli_epi16(a, COUNT);
2699 } else {
2700 return _mm_setzero_si128();
2701 }
2702}
2703
2704template <size_t COUNT>
2705static SIMD_INLINE Vec<Int, 16> slli(const Vec<Int, 16> &a)
2706{
2707 SIMD_IF_CONSTEXPR (COUNT < 32) {
2708 return _mm_slli_epi32(a, COUNT);
2709 } else {
2710 return _mm_setzero_si128();
2711 }
2712}
2713
2714template <size_t COUNT>
2715static SIMD_INLINE Vec<Long, 16> slli(const Vec<Long, 16> &a)
2716{
2717 SIMD_IF_CONSTEXPR (COUNT < 64) {
2718 return _mm_slli_epi64(a, COUNT);
2719 } else {
2720 return _mm_setzero_si128();
2721 }
2722}
2723
2724// 19. Dec 22 (Jonas Keller): added sra, srl and sll functions
2725
2726// ---------------------------------------------------------------------------
2727// sra
2728// ---------------------------------------------------------------------------
2729
2730static SIMD_INLINE Vec<Byte, 16> sra(const Vec<Byte, 16> &a,
2731 const uint8_t count)
2732{
2733 // there is no _mm_sra_epi8 intrinsic
2734 if (count >= 8) {
2735 // result should be all ones if a is negative, all zeros otherwise
2736 return _mm_cmplt_epi8(a, _mm_setzero_si128());
2737 }
2738 __m128i odd = _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
2739 __m128i even =
2740 _mm_sra_epi16(_mm_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
2741 return ifelse<Byte>(_mm_set1_epi16((int16_t) 0xFF00), odd, even);
2742}
2743
2744static SIMD_INLINE Vec<SignedByte, 16> sra(const Vec<SignedByte, 16> &a,
2745 const uint8_t count)
2746{
2747 // there is no _mm_sra_epi8 intrinsic
2748 if (count >= 8) {
2749 // result should be all ones if a is negative, all zeros otherwise
2750 return _mm_cmplt_epi8(a, _mm_setzero_si128());
2751 }
2752 __m128i odd = _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
2753 __m128i even =
2754 _mm_sra_epi16(_mm_slli_epi16(a, 8), _mm_cvtsi32_si128(count + 8));
2755 return ifelse<SignedByte>(_mm_set1_epi16((int16_t) 0xFF00), odd, even);
2756}
2757
2758static SIMD_INLINE Vec<Word, 16> sra(const Vec<Word, 16> &a,
2759 const uint8_t count)
2760{
2761 return _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
2762}
2763
2764static SIMD_INLINE Vec<Short, 16> sra(const Vec<Short, 16> &a,
2765 const uint8_t count)
2766{
2767 return _mm_sra_epi16(a, _mm_cvtsi32_si128(count));
2768}
2769
2770static SIMD_INLINE Vec<Int, 16> sra(const Vec<Int, 16> &a, const uint8_t count)
2771{
2772 return _mm_sra_epi32(a, _mm_cvtsi32_si128(count));
2773}
2774
2775static SIMD_INLINE Vec<Long, 16> sra(const Vec<Long, 16> &a,
2776 const uint8_t count)
2777{
2778 // workaround from Hacker's Delight, 2–17 Double-Length Shifts, Shift right
2779 // double signed:
2780 const __m128i odd = _mm_sra_epi32(a, _mm_cvtsi32_si128(count));
2781 __m128i even;
2782 if (count < 32) {
2783 even = _mm_or_si128(
2784 _mm_srl_epi32(a, _mm_cvtsi32_si128(count)),
2785 _mm_sll_epi32(_mm_srli_si128(a, 4), _mm_cvtsi32_si128(32 - count)));
2786 } else {
2787 even = _mm_sra_epi32(_mm_srli_si128(a, 4), _mm_cvtsi32_si128(count - 32));
2788 }
2789#ifdef __SSE4_1__
2790 return _mm_blend_epi16(even, odd, 0xcc);
2791#else
2792 return _mm_or_si128(_mm_and_si128(even, _mm_set1_epi64x(0x00000000FFFFFFFF)),
2793 _mm_and_si128(odd, _mm_set1_epi64x(0xFFFFFFFF00000000)));
2794#endif
2795}
2796
2797// ---------------------------------------------------------------------------
2798// srl
2799// ---------------------------------------------------------------------------
2800
2801static SIMD_INLINE Vec<Byte, 16> srl(const Vec<Byte, 16> &a,
2802 const uint8_t count)
2803{
2804 return _mm_and_si128(_mm_srl_epi16(a, _mm_cvtsi32_si128(count)),
2805 _mm_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
2806}
2807
2808static SIMD_INLINE Vec<SignedByte, 16> srl(const Vec<SignedByte, 16> &a,
2809 const uint8_t count)
2810{
2811 return _mm_and_si128(_mm_srl_epi16(a, _mm_cvtsi32_si128(count)),
2812 _mm_set1_epi8((int8_t) (uint8_t) (0xff >> count)));
2813}
2814
2815static SIMD_INLINE Vec<Word, 16> srl(const Vec<Word, 16> &a,
2816 const uint8_t count)
2817{
2818 return _mm_srl_epi16(a, _mm_cvtsi32_si128(count));
2819}
2820
2821static SIMD_INLINE Vec<Short, 16> srl(const Vec<Short, 16> &a,
2822 const uint8_t count)
2823{
2824 return _mm_srl_epi16(a, _mm_cvtsi32_si128(count));
2825}
2826
2827static SIMD_INLINE Vec<Int, 16> srl(const Vec<Int, 16> &a, const uint8_t count)
2828{
2829 return _mm_srl_epi32(a, _mm_cvtsi32_si128(count));
2830}
2831
2832static SIMD_INLINE Vec<Long, 16> srl(const Vec<Long, 16> &a,
2833 const uint8_t count)
2834{
2835 return _mm_srl_epi64(a, _mm_cvtsi32_si128(count));
2836}
2837
2838// ---------------------------------------------------------------------------
2839// sll
2840// ---------------------------------------------------------------------------
2841
2842static SIMD_INLINE Vec<Byte, 16> sll(const Vec<Byte, 16> &a,
2843 const uint8_t count)
2844{
2845 return _mm_and_si128(
2846 _mm_sll_epi16(a, _mm_cvtsi32_si128(count)),
2847 _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
2848}
2849
2850static SIMD_INLINE Vec<SignedByte, 16> sll(const Vec<SignedByte, 16> &a,
2851 const uint8_t count)
2852{
2853 return _mm_and_si128(
2854 _mm_sll_epi16(a, _mm_cvtsi32_si128(count)),
2855 _mm_set1_epi8((int8_t) (uint8_t) (0xff & (0xff << count))));
2856}
2857
2858static SIMD_INLINE Vec<Word, 16> sll(const Vec<Word, 16> &a,
2859 const uint8_t count)
2860{
2861 return _mm_sll_epi16(a, _mm_cvtsi32_si128(count));
2862}
2863
2864static SIMD_INLINE Vec<Short, 16> sll(const Vec<Short, 16> &a,
2865 const uint8_t count)
2866{
2867 return _mm_sll_epi16(a, _mm_cvtsi32_si128(count));
2868}
2869
2870static SIMD_INLINE Vec<Int, 16> sll(const Vec<Int, 16> &a, const uint8_t count)
2871{
2872 return _mm_sll_epi32(a, _mm_cvtsi32_si128(count));
2873}
2874
2875static SIMD_INLINE Vec<Long, 16> sll(const Vec<Long, 16> &a,
2876 const uint8_t count)
2877{
2878 return _mm_sll_epi64(a, _mm_cvtsi32_si128(count));
2879}
2880
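// illustrative sketch (hypothetical helper, not part of the library): the
// srai/srli/slli templates above take the shift count as a compile-time
// template parameter, whereas sra/srl/sll take it as a runtime argument;
// here both are combined into a total arithmetic shift by 2 + n
static SIMD_INLINE Vec<Int, 16> exampleShiftTwice(const Vec<Int, 16> &a,
                                                  const uint8_t n)
{
  return sra(srai<2>(a), n); // compile-time count 2, then runtime count n
}
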
2881// 19. Sep 22 (Jonas Keller):
2882// added Byte and SignedByte versions of hadd, hadds, hsub and hsubs
2883// added Word version of hadds and hsubs
2884
2885// ---------------------------------------------------------------------------
2886// hadd
2887// ---------------------------------------------------------------------------
2888
2889template <typename T>
2890static SIMD_INLINE Vec<T, 16> hadd(const Vec<T, 16> &a, const Vec<T, 16> &b)
2891{
2892 Vec<T, 16> x, y;
2893 unzip(a, b, x, y, Bytes<sizeof(T)>());
2894 return add(x, y);
2895}
2896
2897static SIMD_INLINE Vec<Word, 16> hadd(const Vec<Word, 16> &a,
2898 const Vec<Word, 16> &b)
2899{
2900 return _mm_hadd_epi16(a, b);
2901}
2902
2903static SIMD_INLINE Vec<Short, 16> hadd(const Vec<Short, 16> &a,
2904 const Vec<Short, 16> &b)
2905{
2906 return _mm_hadd_epi16(a, b);
2907}
2908
2909static SIMD_INLINE Vec<Int, 16> hadd(const Vec<Int, 16> &a,
2910 const Vec<Int, 16> &b)
2911{
2912 return _mm_hadd_epi32(a, b);
2913}
2914
2915static SIMD_INLINE Vec<Float, 16> hadd(const Vec<Float, 16> &a,
2916 const Vec<Float, 16> &b)
2917{
2918 return _mm_hadd_ps(a, b);
2919}
2920
2921// _mm_hadd_pd is only available with SSE3; if compiling without SSE3,
2922// the template version above is used
2923#ifdef __SSE3__
2924static SIMD_INLINE Vec<Double, 16> hadd(const Vec<Double, 16> &a,
2925 const Vec<Double, 16> &b)
2926{
2927 return _mm_hadd_pd(a, b);
2928}
2929#endif
2930
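// illustrative usage sketch (hypothetical helper, not part of the library):
// hadd sums horizontally adjacent pairs, first within a, then within b, e.g.
// for Short: a = [a0..a7], b = [b0..b7] ->
//   [a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7]
static SIMD_INLINE Vec<Short, 16> examplePairwiseSums(const Vec<Short, 16> &a,
                                                      const Vec<Short, 16> &b)
{
  return hadd(a, b); // wraps around on overflow (see hadds below for saturation)
}
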
2931// ---------------------------------------------------------------------------
2932// hadds
2933// ---------------------------------------------------------------------------
2934
2935// 09. Mar 23 (Jonas Keller): made Int version of hadds saturating
2936
2937template <typename T>
2938static SIMD_INLINE Vec<T, 16> hadds(const Vec<T, 16> &a, const Vec<T, 16> &b)
2939{
2940 Vec<T, 16> x, y;
2941 unzip(a, b, x, y, Bytes<sizeof(T)>());
2942 return adds(x, y);
2943}
2944
2945static SIMD_INLINE Vec<Short, 16> hadds(const Vec<Short, 16> &a,
2946 const Vec<Short, 16> &b)
2947{
2948 return _mm_hadds_epi16(a, b);
2949}
2950
2951// Float not saturated
2952static SIMD_INLINE Vec<Float, 16> hadds(const Vec<Float, 16> &a,
2953 const Vec<Float, 16> &b)
2954{
2955 return _mm_hadd_ps(a, b);
2956}
2957
2958// _mm_hadd_pd is only available with SSE3; if compiling without SSE3,
2959// the template version above is used
2960#ifdef __SSE3__
2961static SIMD_INLINE Vec<Double, 16> hadds(const Vec<Double, 16> &a,
2962 const Vec<Double, 16> &b)
2963{
2964 return _mm_hadd_pd(a, b);
2965}
2966#endif
2967
2968// ---------------------------------------------------------------------------
2969// hsub
2970// ---------------------------------------------------------------------------
2971
2972template <typename T>
2973static SIMD_INLINE Vec<T, 16> hsub(const Vec<T, 16> &a, const Vec<T, 16> &b)
2974{
2975 Vec<T, 16> x, y;
2976 unzip(a, b, x, y, Bytes<sizeof(T)>());
2977 return sub(x, y);
2978}
2979
2980static SIMD_INLINE Vec<Word, 16> hsub(const Vec<Word, 16> &a,
2981 const Vec<Word, 16> &b)
2982{
2983 return _mm_hsub_epi16(a, b);
2984}
2985
2986static SIMD_INLINE Vec<Short, 16> hsub(const Vec<Short, 16> &a,
2987 const Vec<Short, 16> &b)
2988{
2989 return _mm_hsub_epi16(a, b);
2990}
2991
2992static SIMD_INLINE Vec<Int, 16> hsub(const Vec<Int, 16> &a,
2993 const Vec<Int, 16> &b)
2994{
2995 return _mm_hsub_epi32(a, b);
2996}
2997
2998static SIMD_INLINE Vec<Float, 16> hsub(const Vec<Float, 16> &a,
2999 const Vec<Float, 16> &b)
3000{
3001 return _mm_hsub_ps(a, b);
3002}
3003
3004// _mm_hsub_pd is only available with SSE3; if compiling without SSE3,
3005// the template version above is used
3006#ifdef __SSE3__
3007static SIMD_INLINE Vec<Double, 16> hsub(const Vec<Double, 16> &a,
3008 const Vec<Double, 16> &b)
3009{
3010 return _mm_hsub_pd(a, b);
3011}
3012#endif
3013
3014// ---------------------------------------------------------------------------
3015// hsubs
3016// ---------------------------------------------------------------------------
3017
3018// 09. Mar 23 (Jonas Keller): made Int version of hsubs saturating
3019
3020template <typename T>
3021static SIMD_INLINE Vec<T, 16> hsubs(const Vec<T, 16> &a, const Vec<T, 16> &b)
3022{
3023 Vec<T, 16> x, y;
3024 unzip(a, b, x, y, Bytes<sizeof(T)>());
3025 return subs(x, y);
3026}
3027
3028static SIMD_INLINE Vec<Short, 16> hsubs(const Vec<Short, 16> &a,
3029 const Vec<Short, 16> &b)
3030{
3031 return _mm_hsubs_epi16(a, b);
3032}
3033
3034// Float not saturated
3035static SIMD_INLINE Vec<Float, 16> hsubs(const Vec<Float, 16> &a,
3036 const Vec<Float, 16> &b)
3037{
3038 return _mm_hsub_ps(a, b);
3039}
3040
3041// _mm_hsub_pd is only available with SSE3; if compiling without SSE3,
3042// the template version above is used
3043#ifdef __SSE3__
3044static SIMD_INLINE Vec<Double, 16> hsubs(const Vec<Double, 16> &a,
3045 const Vec<Double, 16> &b)
3046{
3047 return _mm_hsub_pd(a, b);
3048}
3049#endif
3050
3051// ---------------------------------------------------------------------------
3052// element-wise shift right
3053// ---------------------------------------------------------------------------
3054
3055template <size_t COUNT, typename T>
3056static SIMD_INLINE Vec<T, 16> srle(const Vec<T, 16> &a)
3057{
3058 const auto intA = reinterpret(a, OutputType<Int>());
3059 const Vec<Int, 16> result =
3060 _mm_srli_si128(intA, vec::min(COUNT * sizeof(T), 16lu));
3061 return reinterpret(result, OutputType<T>());
3062}
3063
3064// ---------------------------------------------------------------------------
3065// element-wise shift left
3066// ---------------------------------------------------------------------------
3067
3068// all integer versions
3069template <size_t COUNT, typename T>
3070static SIMD_INLINE Vec<T, 16> slle(const Vec<T, 16> &a)
3071{
3072 const auto intA = reinterpret(a, OutputType<Int>());
3073 const Vec<Int, 16> result =
3074 _mm_slli_si128(intA, vec::min(COUNT * sizeof(T), 16lu));
3075 return reinterpret(result, OutputType<T>());
3076}
3077
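// example of the element-wise shifts above (illustration only), T = Int,
// a = [a0, a1, a2, a3]:
//   srle<1>(a) = [a1, a2, a3,  0]   (elements move towards index 0)
//   slle<1>(a) = [ 0, a0, a1, a2]   (elements move towards higher indices)
// zeros are shifted in at the vacated positions
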
3078// ---------------------------------------------------------------------------
3079// alignre
3080// ---------------------------------------------------------------------------
3081
3082// all integer versions
3083template <size_t COUNT, typename T>
3084static SIMD_INLINE Vec<T, 16> alignre(const Vec<T, 16> &h, const Vec<T, 16> &l)
3085{
3086 SIMD_IF_CONSTEXPR (COUNT * sizeof(T) < 32) {
3087 return _mm_alignr_epi8(h, l, COUNT * sizeof(T));
3088 } else {
3089 return _mm_setzero_si128();
3090 }
3091}
3092
3093// float version
3094template <size_t COUNT>
3095static SIMD_INLINE Vec<Float, 16> alignre(const Vec<Float, 16> &h,
3096 const Vec<Float, 16> &l)
3097{
3098 SIMD_IF_CONSTEXPR (COUNT * sizeof(Float) < 32) {
3099 return _mm_castsi128_ps(_mm_alignr_epi8(
3100 _mm_castps_si128(h), _mm_castps_si128(l), COUNT * sizeof(Float)));
3101 } else {
3102 return _mm_setzero_ps();
3103 }
3104}
3105
3106// double version
3107template <size_t COUNT>
3108static SIMD_INLINE Vec<Double, 16> alignre(const Vec<Double, 16> &h,
3109 const Vec<Double, 16> &l)
3110{
3111 SIMD_IF_CONSTEXPR (COUNT * sizeof(Double) < 32) {
3112 return _mm_castsi128_pd(_mm_alignr_epi8(
3113 _mm_castpd_si128(h), _mm_castpd_si128(l), COUNT * sizeof(Double)));
3114 } else {
3115 return _mm_setzero_pd();
3116 }
3117}
3118
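// example (illustration only), T = Int, COUNT = 1,
// l = [l0, l1, l2, l3], h = [h0, h1, h2, h3]:
//   alignre<1>(h, l) = [l1, l2, l3, h0]
// i.e. the result is a 16-byte window into the concatenation h:l that starts
// COUNT elements into l
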
3119// ---------------------------------------------------------------------------
3120// swizzle
3121// ---------------------------------------------------------------------------
3122
3123// swizzle masks (only for 8 and 16 bit element types)
3124
3125// [masks generated from ~/texte/Talks/SSE/swizzle.c]
3126
3127// Byte, SignedByte
3128
3129static SIMD_INLINE __m128i get_swizzle_mask(Integer<2>, Integer<1>)
3130{
3131 return _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
3132}
3133
3134static SIMD_INLINE __m128i get_swizzle_mask(Integer<3>, Integer<1>)
3135{
3136 return _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1);
3137}
3138
3139static SIMD_INLINE __m128i get_swizzle_mask(Integer<4>, Integer<1>)
3140{
3141 return _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
3142}
3143
3144static SIMD_INLINE __m128i get_swizzle_mask(Integer<5>, Integer<1>)
3145{
3146 return _mm_setr_epi8(0, 5, 1, 6, 2, 7, 3, 8, 4, 9, -1, -1, -1, -1, -1, -1);
3147}
3148
3149// Word, Short
3150
3151static SIMD_INLINE __m128i get_swizzle_mask(Integer<2>, Integer<2>)
3152{
3153 return _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
3154}
3155
3156static SIMD_INLINE __m128i get_swizzle_mask(Integer<3>, Integer<2>)
3157{
3158 return _mm_setr_epi8(0, 1, 6, 7, 2, 3, 8, 9, 4, 5, 10, 11, -1, -1, -1, -1);
3159}
3160
3161static SIMD_INLINE __m128i get_swizzle_mask(Integer<4>, Integer<2>)
3162{
3163 return _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
3164}
3165
3166static SIMD_INLINE __m128i get_swizzle_mask(Integer<5>, Integer<2>)
3167{
3168 return _mm_setr_epi8(0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, -1, -1, -1,
3169 -1);
3170}
3171
3172// hub
3173template <size_t N, typename T>
3174static SIMD_INLINE __m128i get_swizzle_mask()
3175{
3176 return get_swizzle_mask(Integer<N>(), Integer<sizeof(T)>());
3177}
3178
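// example (illustration only): used with _mm_shuffle_epi8,
// get_swizzle_mask<2, Byte>() gathers the even-indexed bytes into the lower
// half and the odd-indexed bytes into the upper half, i.e.
//   [x0,y0,x1,y1,...,x7,y7] -> [x0,x1,...,x7, y0,y1,...,y7],
// the building block of the n = 2 swizzle below
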
3179// ---------- swizzle aux functions -----------
3180
3181// ALIGNOFF is the offset given in bytes (i.e. the element-wise offset for
3181// byte-sized elements)
3182template <size_t ALIGNOFF>
3183static SIMD_INLINE __m128i align_shuffle_128(__m128i lo, __m128i hi,
3184 __m128i mask)
3185{
3186 static_assert(ALIGNOFF < 32, "");
3187 return _mm_shuffle_epi8(_mm_alignr_epi8(hi, lo, ALIGNOFF), mask);
3188}
3189
3190// ---------- swizzle (AoS to SoA) ----------
3191
3192// 01. Apr 23 (Jonas Keller): switched from using tag dispatching to using
3193// enable_if SFINAE, which allows more cases with the same implementation
3194// to be combined
3195
3196// -------------------- n = 1 --------------------
3197
3198// all types
3199template <typename T>
3200static SIMD_INLINE void swizzle(Vec<T, 16>[1], Integer<1>)
3201{
3202 // v remains unchanged
3203}
3204
3205// -------------------- n = 2 --------------------
3206
3207// 8 and 16 bit integer types
3208template <typename T,
3209 SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
3210static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
3211{
3212 __m128i s[2];
3213 s[0] = _mm_shuffle_epi8(v[0], get_swizzle_mask<2, T>());
3214 s[1] = _mm_shuffle_epi8(v[1], get_swizzle_mask<2, T>());
3215 v[0] = _mm_unpacklo_epi64(s[0], s[1]);
3216 v[1] = _mm_unpackhi_epi64(s[0], s[1]);
3217}
3218
3219// 32 bit types
3220template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3221static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
3222{
3223 const __m128 v0tmp = reinterpret(v[0], OutputType<Float>());
3224 const __m128 v1tmp = reinterpret(v[1], OutputType<Float>());
3225 const Vec<Float, 16> v0TmpOut =
3226 _mm_shuffle_ps(v0tmp, v1tmp, _MM_SHUFFLE(2, 0, 2, 0));
3227 const Vec<Float, 16> v1TmpOut =
3228 _mm_shuffle_ps(v0tmp, v1tmp, _MM_SHUFFLE(3, 1, 3, 1));
3229 v[0] = reinterpret(v0TmpOut, OutputType<T>());
3230 v[1] = reinterpret(v1TmpOut, OutputType<T>());
3231}
3232
3233// 64 bit types
3234template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3235 typename = void>
3236static SIMD_INLINE void swizzle(Vec<T, 16> v[2], Integer<2>)
3237{
3238 const __m128d v0tmp = reinterpret(v[0], OutputType<Double>());
3239 const __m128d v1tmp = reinterpret(v[1], OutputType<Double>());
3240 const Vec<Double, 16> v0TmpOut =
3241 _mm_shuffle_pd(v0tmp, v1tmp, _MM_SHUFFLE2(0, 0));
3242 const Vec<Double, 16> v1TmpOut =
3243 _mm_shuffle_pd(v0tmp, v1tmp, _MM_SHUFFLE2(1, 1));
3244 v[0] = reinterpret(v0TmpOut, OutputType<T>());
3245 v[1] = reinterpret(v1TmpOut, OutputType<T>());
3246}
3247
3248// -------------------- n = 3 --------------------
3249
3250// 8 and 16 bit integer types
3251template <typename T,
3252 SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
3253static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
3254{
3255 __m128i mask = get_swizzle_mask<3, T>();
3256 __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
3257 __m128i s1 = align_shuffle_128<12>(v[0], v[1], mask);
3258 __m128i s2 = align_shuffle_128<8>(v[1], v[2], mask);
3259 __m128i s3 = align_shuffle_128<4>(v[2], _mm_undefined_si128(), mask);
3260 __m128i l01 = _mm_unpacklo_epi32(s0, s1);
3261 __m128i h01 = _mm_unpackhi_epi32(s0, s1);
3262 __m128i l23 = _mm_unpacklo_epi32(s2, s3);
3263 __m128i h23 = _mm_unpackhi_epi32(s2, s3);
3264 v[0] = _mm_unpacklo_epi64(l01, l23);
3265 v[1] = _mm_unpackhi_epi64(l01, l23);
3266 v[2] = _mm_unpacklo_epi64(h01, h23);
3267}
3268
3269// 32 bit types
3270// from Stan Melax: "3D Vector Normalization..."
3271// https://software.intel.com/en-us/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx
3272template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3273static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
3274{
3275 const __m128 x0y0z0x1 = reinterpret(v[0], OutputType<Float>());
3276 const __m128 y1z1x2y2 = reinterpret(v[1], OutputType<Float>());
3277 const __m128 z2x3y3z3 = reinterpret(v[2], OutputType<Float>());
3278 const __m128 x2y2x3y3 =
3279 _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
3280 const __m128 y0z0y1z1 =
3281 _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
3282 const Vec<Float, 16> x0x1x2x3 =
3283 _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
3284 const Vec<Float, 16> y0y1y2y3 =
3285 _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
3286 const Vec<Float, 16> z0z1z2z3 =
3287 _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));
3288 v[0] = reinterpret(x0x1x2x3, OutputType<T>());
3289 v[1] = reinterpret(y0y1y2y3, OutputType<T>());
3290 v[2] = reinterpret(z0z1z2z3, OutputType<T>());
3291}
3292
3293// 64 bit types
3294template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3295 typename = void, typename = void>
3296static SIMD_INLINE void swizzle(Vec<T, 16> v[3], Integer<3>)
3297{
3298 const __m128d x0y0 = reinterpret(v[0], OutputType<Double>());
3299 const __m128d z0x1 = reinterpret(v[1], OutputType<Double>());
3300 const __m128d y1z1 = reinterpret(v[2], OutputType<Double>());
3301 const Vec<Double, 16> x0x1 = _mm_shuffle_pd(x0y0, z0x1, _MM_SHUFFLE2(1, 0));
3302 const Vec<Double, 16> y0y1 = _mm_shuffle_pd(x0y0, y1z1, _MM_SHUFFLE2(0, 1));
3303 const Vec<Double, 16> z0z1 = _mm_shuffle_pd(z0x1, y1z1, _MM_SHUFFLE2(1, 0));
3304 v[0] = reinterpret(x0x1, OutputType<T>());
3305 v[1] = reinterpret(y0y1, OutputType<T>());
3306 v[2] = reinterpret(z0z1, OutputType<T>());
3307}
3308
3309// -------------------- n = 4 --------------------
3310
3311// 8 and 16 bit integer types
3312template <typename T,
3313 SIMD_ENABLE_IF(sizeof(T) <= 2 && std::is_integral<T>::value)>
3314static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
3315{
3316 __m128i mask = get_swizzle_mask<4, T>();
3317 __m128i s[4];
3318 s[0] = _mm_shuffle_epi8(v[0], mask);
3319 s[1] = _mm_shuffle_epi8(v[1], mask);
3320 s[2] = _mm_shuffle_epi8(v[2], mask);
3321 s[3] = _mm_shuffle_epi8(v[3], mask);
3322 __m128i l01 = _mm_unpacklo_epi32(s[0], s[1]);
3323 __m128i h01 = _mm_unpackhi_epi32(s[0], s[1]);
3324 __m128i l23 = _mm_unpacklo_epi32(s[2], s[3]);
3325 __m128i h23 = _mm_unpackhi_epi32(s[2], s[3]);
3326 v[0] = _mm_unpacklo_epi64(l01, l23);
3327 v[1] = _mm_unpackhi_epi64(l01, l23);
3328 v[2] = _mm_unpacklo_epi64(h01, h23);
3329 v[3] = _mm_unpackhi_epi64(h01, h23);
3330}
3331
3332// 32 bit types
3333template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void>
3334static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
3335{
3336 __m128 vFloat[4];
3337 for (size_t i = 0; i < 4; ++i) {
3338 vFloat[i] = reinterpret(v[i], OutputType<Float>());
3339 }
3340 __m128 s[4];
3341 s[0] = _mm_shuffle_ps(vFloat[0], vFloat[1], _MM_SHUFFLE(1, 0, 1, 0));
3342 s[1] = _mm_shuffle_ps(vFloat[0], vFloat[1], _MM_SHUFFLE(3, 2, 3, 2));
3343 s[2] = _mm_shuffle_ps(vFloat[2], vFloat[3], _MM_SHUFFLE(1, 0, 1, 0));
3344 s[3] = _mm_shuffle_ps(vFloat[2], vFloat[3], _MM_SHUFFLE(3, 2, 3, 2));
3345 Vec<Float, 16> vOut[4];
3346 vOut[0] = _mm_shuffle_ps(s[0], s[2], _MM_SHUFFLE(2, 0, 2, 0));
3347 vOut[1] = _mm_shuffle_ps(s[0], s[2], _MM_SHUFFLE(3, 1, 3, 1));
3348 vOut[2] = _mm_shuffle_ps(s[1], s[3], _MM_SHUFFLE(2, 0, 2, 0));
3349 vOut[3] = _mm_shuffle_ps(s[1], s[3], _MM_SHUFFLE(3, 1, 3, 1));
3350 for (size_t i = 0; i < 4; ++i) {
3351 v[i] = reinterpret(vOut[i], OutputType<T>());
3352 }
3353}
3354
3355// 64 bit types
3356template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3357 typename = void>
3358static SIMD_INLINE void swizzle(Vec<T, 16> v[4], Integer<4>)
3359{
3360 const __m128d x0y0 = reinterpret(v[0], OutputType<Double>());
3361 const __m128d z0w0 = reinterpret(v[1], OutputType<Double>());
3362 const __m128d x1y1 = reinterpret(v[2], OutputType<Double>());
3363 const __m128d z1w1 = reinterpret(v[3], OutputType<Double>());
3364 const Vec<Double, 16> x0x1 = _mm_unpacklo_pd(x0y0, x1y1);
3365 const Vec<Double, 16> y0y1 = _mm_unpackhi_pd(x0y0, x1y1);
3366 const Vec<Double, 16> z0z1 = _mm_unpacklo_pd(z0w0, z1w1);
3367 const Vec<Double, 16> w0w1 = _mm_unpackhi_pd(z0w0, z1w1);
3368 v[0] = reinterpret(x0x1, OutputType<T>());
3369 v[1] = reinterpret(y0y1, OutputType<T>());
3370 v[2] = reinterpret(z0z1, OutputType<T>());
3371 v[3] = reinterpret(w0w1, OutputType<T>());
3372}
3373
3374// -------------------- n = 5 --------------------
3375
3376// 8 bit integer types
3377template <typename T,
3378 SIMD_ENABLE_IF(sizeof(T) == 1 && std::is_integral<T>::value)>
3379static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
3380{
3381 __m128i mask = get_swizzle_mask<5, T>();
3382 __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
3383 __m128i s1 = align_shuffle_128<10>(v[0], v[1], mask);
3384 __m128i s2 = align_shuffle_128<4>(v[1], v[2], mask);
3385 __m128i s3 = align_shuffle_128<14>(v[1], v[2], mask);
3386 __m128i s4 = align_shuffle_128<8>(v[2], v[3], mask);
3387 __m128i s5 = align_shuffle_128<2>(v[3], _mm_undefined_si128(), mask);
3388 __m128i s6 = align_shuffle_128<12>(v[3], v[4], mask);
3389 __m128i s7 = align_shuffle_128<6>(v[4], _mm_undefined_si128(), mask);
3390 __m128i l01 = _mm_unpacklo_epi16(s0, s1);
3391 __m128i h01 = _mm_unpackhi_epi16(s0, s1);
3392 __m128i l23 = _mm_unpacklo_epi16(s2, s3);
3393 __m128i h23 = _mm_unpackhi_epi16(s2, s3);
3394 __m128i l45 = _mm_unpacklo_epi16(s4, s5);
3395 __m128i h45 = _mm_unpackhi_epi16(s4, s5);
3396 __m128i l67 = _mm_unpacklo_epi16(s6, s7);
3397 __m128i h67 = _mm_unpackhi_epi16(s6, s7);
3398 __m128i ll01l23 = _mm_unpacklo_epi32(l01, l23);
3399 __m128i hl01l23 = _mm_unpackhi_epi32(l01, l23);
3400 __m128i ll45l67 = _mm_unpacklo_epi32(l45, l67);
3401 __m128i hl45l67 = _mm_unpackhi_epi32(l45, l67);
3402 __m128i lh01h23 = _mm_unpacklo_epi32(h01, h23);
3403 __m128i lh45h67 = _mm_unpacklo_epi32(h45, h67);
3404 v[0] = _mm_unpacklo_epi64(ll01l23, ll45l67);
3405 v[1] = _mm_unpackhi_epi64(ll01l23, ll45l67);
3406 v[2] = _mm_unpacklo_epi64(hl01l23, hl45l67);
3407 v[3] = _mm_unpackhi_epi64(hl01l23, hl45l67);
3408 v[4] = _mm_unpacklo_epi64(lh01h23, lh45h67);
3409}
3410
3411// 16 bit integer types
3412template <typename T,
3413 SIMD_ENABLE_IF(sizeof(T) == 2 && std::is_integral<T>::value),
3414 typename = void>
3415static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
3416{
3417 __m128i mask = get_swizzle_mask<5, T>();
3418 __m128i s0 = align_shuffle_128<0>(v[0], v[1], mask);
3419 __m128i s1 = align_shuffle_128<6>(v[0], v[1], mask);
3420 __m128i s2 = align_shuffle_128<4>(v[1], v[2], mask);
3421 __m128i s3 = align_shuffle_128<10>(v[1], v[2], mask);
3422 __m128i s4 = align_shuffle_128<8>(v[2], v[3], mask);
3423 __m128i s5 = align_shuffle_128<14>(v[2], v[3], mask);
3424 __m128i s6 = align_shuffle_128<12>(v[3], v[4], mask);
3425 __m128i s7 = align_shuffle_128<2>(v[4], _mm_undefined_si128(), mask);
3426 __m128i l02 = _mm_unpacklo_epi32(s0, s2);
3427 __m128i h02 = _mm_unpackhi_epi32(s0, s2);
3428 __m128i l13 = _mm_unpacklo_epi32(s1, s3);
3429 __m128i l46 = _mm_unpacklo_epi32(s4, s6);
3430 __m128i h46 = _mm_unpackhi_epi32(s4, s6);
3431 __m128i l57 = _mm_unpacklo_epi32(s5, s7);
3432 v[0] = _mm_unpacklo_epi64(l02, l46);
3433 v[1] = _mm_unpackhi_epi64(l02, l46);
3434 v[2] = _mm_unpacklo_epi64(h02, h46);
3435 v[3] = _mm_unpacklo_epi64(l13, l57);
3436 v[4] = _mm_unpackhi_epi64(l13, l57);
3437}
3438
3439// 32 bit types
3440template <typename T, SIMD_ENABLE_IF(sizeof(T) == 4), typename = void,
3441 typename = void>
3442static SIMD_INLINE void swizzle(Vec<T, 16> vT[5], Integer<5>)
3443{
3444 __m128i v[5];
3445 for (size_t i = 0; i < 5; i++) {
3446 v[i] = reinterpret(vT[i], OutputType<Int>());
3447 }
3448 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3449 // v[0]: 0 1 2 3
3450 // v[1]: 4 x x x
3451 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3452 // x x x x
3453 // 5 6 7 8
3454 __m128i s2 = _mm_alignr_epi8(v[2], v[1], 4);
3455 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3456 // x x x x
3457 // 9 x x x
3458 __m128i s3 = _mm_alignr_epi8(v[3], v[2], 4);
3459 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3460 // x x x x
3461 // 10 11 12 13
3462 __m128i s4 = _mm_alignr_epi8(v[3], v[2], 8);
3463 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3464 // x x x x
3465 // 14 x x x
3466 __m128i s5 = _mm_alignr_epi8(v[4], v[3], 8);
3467 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3468 // X X X X
3469 // 15 16 17 18
3470 __m128i s6 = _mm_alignr_epi8(v[4], v[3], 12);
3471 // v: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19
3472 // X X X X
3473 // 19 x x x
3474 __m128i s7 = _mm_alignr_epi8(v[0], v[4], 12);
3475 // 0 1 2 3 / 5 6 7 8 -> 0 5 1 6 / 2 7 3 8
3476 __m128i l02 = _mm_unpacklo_epi32(v[0], s2);
3477 __m128i h02 = _mm_unpackhi_epi32(v[0], s2);
3478 // 4 x x x / 9 x x x -> 4 9 x x
3479 __m128i l13 = _mm_unpacklo_epi32(v[1], s3);
3480 // 10 11 12 13 / 15 16 17 18 -> 10 15 11 16 / 12 17 13 18
3481 __m128i l46 = _mm_unpacklo_epi32(s4, s6);
3482 __m128i h46 = _mm_unpackhi_epi32(s4, s6);
3483 // 14 x x x / 19 x x x -> 14 19 x x
3484 __m128i l57 = _mm_unpacklo_epi32(s5, s7);
3485
3486 const Vec<Int, 16> vOut[5] = {
3487 // 0 5 1 6 / 10 15 11 16 -> 0 5 10 15 / 1 6 11 16
3488 _mm_unpacklo_epi64(l02, l46),
3489 _mm_unpackhi_epi64(l02, l46),
3490 // 2 7 3 8 / 12 17 13 18 -> 2 7 12 17 / 3 8 13 18
3491 _mm_unpacklo_epi64(h02, h46),
3492 _mm_unpackhi_epi64(h02, h46),
3493 // 4 9 x x / 14 19 x x -> 4 9 14 19
3494 _mm_unpacklo_epi64(l13, l57),
3495 };
3496 for (size_t i = 0; i < 5; ++i) {
3497 vT[i] = reinterpret(vOut[i], OutputType<T>());
3498 }
3499}
3500
3501// 64 bit types
3502template <typename T, SIMD_ENABLE_IF(sizeof(T) == 8), typename = void,
3503 typename = void, typename = void>
3504static SIMD_INLINE void swizzle(Vec<T, 16> v[5], Integer<5>)
3505{
3506 const __m128d a0b0 = reinterpret(v[0], OutputType<Double>());
3507 const __m128d c0d0 = reinterpret(v[1], OutputType<Double>());
3508 const __m128d e0a1 = reinterpret(v[2], OutputType<Double>());
3509 const __m128d b1c1 = reinterpret(v[3], OutputType<Double>());
3510 const __m128d d1e1 = reinterpret(v[4], OutputType<Double>());
3511 const Vec<Double, 16> a0a1 = _mm_shuffle_pd(a0b0, e0a1, _MM_SHUFFLE2(1, 0));
3512 const Vec<Double, 16> b0b1 = _mm_shuffle_pd(a0b0, b1c1, _MM_SHUFFLE2(0, 1));
3513 const Vec<Double, 16> c0c1 = _mm_shuffle_pd(c0d0, b1c1, _MM_SHUFFLE2(1, 0));
3514 const Vec<Double, 16> d0d1 = _mm_shuffle_pd(c0d0, d1e1, _MM_SHUFFLE2(0, 1));
3515 const Vec<Double, 16> e0e1 = _mm_shuffle_pd(e0a1, d1e1, _MM_SHUFFLE2(1, 0));
3516 v[0] = reinterpret(a0a1, OutputType<T>());
3517 v[1] = reinterpret(b0b1, OutputType<T>());
3518 v[2] = reinterpret(c0c1, OutputType<T>());
3519 v[3] = reinterpret(d0d1, OutputType<T>());
3520 v[4] = reinterpret(e0e1, OutputType<T>());
3521}
3522
3523// ---------------------------------------------------------------------------
3524// compare <
3525// ---------------------------------------------------------------------------
3526
3527// http://stackoverflow.com/questions/16204663/sse-compare-packed-unsigned-bytes
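// note: there is no unsigned byte comparison; xor-ing both operands with 0x80
// adds -128 mod 256 and thus maps unsigned order to signed order, so the
// signed comparison yields the unsigned result, e.g. 100 < 200 (unsigned)
// becomes -28 < 72 (signed)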
3528static SIMD_INLINE Vec<Byte, 16> cmplt(const Vec<Byte, 16> &a,
3529 const Vec<Byte, 16> &b)
3530{
3531 __m128i signbit = _mm_set1_epi32(0x80808080);
3532 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x80
3533 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x80
3534 return _mm_cmplt_epi8(a1, b1);
3535}
3536
3537static SIMD_INLINE Vec<SignedByte, 16> cmplt(const Vec<SignedByte, 16> &a,
3538 const Vec<SignedByte, 16> &b)
3539{
3540 return _mm_cmplt_epi8(a, b);
3541}
3542
3543static SIMD_INLINE Vec<Word, 16> cmplt(const Vec<Word, 16> &a,
3544 const Vec<Word, 16> &b)
3545{
3546 __m128i signbit = _mm_set1_epi32(0x80008000);
3547 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x8000
3548 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x8000
3549 return _mm_cmplt_epi16(a1, b1);
3550}
3551
3552static SIMD_INLINE Vec<Short, 16> cmplt(const Vec<Short, 16> &a,
3553 const Vec<Short, 16> &b)
3554{
3555 return _mm_cmplt_epi16(a, b);
3556}
3557
3558static SIMD_INLINE Vec<Int, 16> cmplt(const Vec<Int, 16> &a,
3559 const Vec<Int, 16> &b)
3560{
3561 return _mm_cmplt_epi32(a, b);
3562}
3563
3564static SIMD_INLINE Vec<Long, 16> cmplt(const Vec<Long, 16> &a,
3565 const Vec<Long, 16> &b)
3566{
3567 // _mm_cmplt_epi64 does not exist
3568#ifdef __SSE4_2__
3569 return _mm_cmpgt_epi64(b, a);
3570#else
3571 // from Hacker's Delight, 2-12 Comparison Predicates:
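  // scalar form: (a - b) ^ ((a ^ b) & ((a - b) ^ a)); if a and b have the
  // same sign, a - b cannot overflow and its sign bit is the result; if the
  // signs differ, the sign bit of a is the result (a < b iff a is negative)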
3572 const __m128i diff = _mm_sub_epi64(a, b);
3573#if 1 // TODO: check which is faster
3574 const __m128i res = _mm_xor_si128(
3575 diff, _mm_and_si128(_mm_xor_si128(a, b), _mm_xor_si128(diff, a)));
3576#else
3577 const __m128i res = _mm_or_si128(_mm_andnot_si128(b, a),
3578 _mm_andnot_si128(_mm_xor_si128(a, b), diff));
3579#endif
3580 // result in highest bit of res
3581 // spread highest bit to all bits
3582 const __m128i spread32 = _mm_srai_epi32(res, 31);
3583 return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
3584#endif
3585}
3586
3587static SIMD_INLINE Vec<Float, 16> cmplt(const Vec<Float, 16> &a,
3588 const Vec<Float, 16> &b)
3589{
3590 return _mm_cmplt_ps(a, b);
3591}
3592
3593static SIMD_INLINE Vec<Double, 16> cmplt(const Vec<Double, 16> &a,
3594 const Vec<Double, 16> &b)
3595{
3596 return _mm_cmplt_pd(a, b);
3597}
3598
3599// ---------------------------------------------------------------------------
3600// compare <=
3601// ---------------------------------------------------------------------------
3602
3603// http://stackoverflow.com/questions/16204663/sse-compare-packed-unsigned-bytes
3604static SIMD_INLINE Vec<Byte, 16> cmple(const Vec<Byte, 16> &a,
3605 const Vec<Byte, 16> &b)
3606{
3607 __m128i signbit = _mm_set1_epi32(0x80808080);
3608 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x80
3609 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x80
3610 return _mm_or_si128(_mm_cmplt_epi8(a1, b1), _mm_cmpeq_epi8(a1, b1));
3611}
3612
3613static SIMD_INLINE Vec<SignedByte, 16> cmple(const Vec<SignedByte, 16> &a,
3614 const Vec<SignedByte, 16> &b)
3615{
3616 return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
3617}
3618
3619static SIMD_INLINE Vec<Word, 16> cmple(const Vec<Word, 16> &a,
3620 const Vec<Word, 16> &b)
3621{
3622 __m128i signbit = _mm_set1_epi32(0x80008000);
3623 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x8000
3624 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x8000
3625 return _mm_or_si128(_mm_cmplt_epi16(a1, b1), _mm_cmpeq_epi16(a1, b1));
3626}
3627
3628static SIMD_INLINE Vec<Short, 16> cmple(const Vec<Short, 16> &a,
3629 const Vec<Short, 16> &b)
3630{
3631 return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
3632}
3633
3634static SIMD_INLINE Vec<Int, 16> cmple(const Vec<Int, 16> &a,
3635 const Vec<Int, 16> &b)
3636{
3637 return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
3638}
3639
3640static SIMD_INLINE Vec<Long, 16> cmple(const Vec<Long, 16> &a,
3641 const Vec<Long, 16> &b)
3642{
3643 // _mm_cmplt_epi64 does not exist
3644#ifdef __SSE4_2__
3645 return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
3646#else
3647 // Hacker's Delight, 2-12 Comparison Predicates:
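  // scalar form: (a | ~b) & ((a ^ b) | ~(b - a)); if the signs of a and b
  // differ, the sign bit of a decides (a <= b iff a is negative); if they
  // are equal, b - a cannot overflow and a <= b iff b - a is non-negative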
3648 const __m128i res = _mm_and_si128(
3649 _mm_or_si128(a, _mm_xor_si128(b, _mm_set1_epi32(-1))),
3650 _mm_or_si128(_mm_xor_si128(a, b),
3651 _mm_xor_si128(_mm_sub_epi64(b, a), _mm_set1_epi32(-1))));
3652 // result in highest bit of res
3653 // spread highest bit to all bits
3654 const __m128i spread32 = _mm_srai_epi32(res, 31);
3655 return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
3656#endif
3657}
3658
3659static SIMD_INLINE Vec<Float, 16> cmple(const Vec<Float, 16> &a,
3660 const Vec<Float, 16> &b)
3661{
3662 return _mm_cmple_ps(a, b);
3663}
3664
3665static SIMD_INLINE Vec<Double, 16> cmple(const Vec<Double, 16> &a,
3666 const Vec<Double, 16> &b)
3667{
3668 return _mm_cmple_pd(a, b);
3669}
3670
3671// ---------------------------------------------------------------------------
3672// compare ==
3673// ---------------------------------------------------------------------------
3674
3675static SIMD_INLINE Vec<Byte, 16> cmpeq(const Vec<Byte, 16> &a,
3676 const Vec<Byte, 16> &b)
3677{
3678 return _mm_cmpeq_epi8(a, b);
3679}
3680
3681static SIMD_INLINE Vec<SignedByte, 16> cmpeq(const Vec<SignedByte, 16> &a,
3682 const Vec<SignedByte, 16> &b)
3683{
3684 return _mm_cmpeq_epi8(a, b);
3685}
3686
3687static SIMD_INLINE Vec<Word, 16> cmpeq(const Vec<Word, 16> &a,
3688 const Vec<Word, 16> &b)
3689{
3690 return _mm_cmpeq_epi16(a, b);
3691}
3692
3693static SIMD_INLINE Vec<Short, 16> cmpeq(const Vec<Short, 16> &a,
3694 const Vec<Short, 16> &b)
3695{
3696 return _mm_cmpeq_epi16(a, b);
3697}
3698
3699static SIMD_INLINE Vec<Int, 16> cmpeq(const Vec<Int, 16> &a,
3700 const Vec<Int, 16> &b)
3701{
3702 return _mm_cmpeq_epi32(a, b);
3703}
3704
3705static SIMD_INLINE Vec<Long, 16> cmpeq(const Vec<Long, 16> &a,
3706 const Vec<Long, 16> &b)
3707{
3708#ifdef __SSE4_1__
3709 return _mm_cmpeq_epi64(a, b);
3710#else
3711 const __m128i res32 = _mm_cmpeq_epi32(a, b);
3712 return _mm_and_si128(res32,
3713 _mm_shuffle_epi32(res32, _MM_SHUFFLE(2, 3, 0, 1)));
3714#endif
3715}
3716
3717static SIMD_INLINE Vec<Float, 16> cmpeq(const Vec<Float, 16> &a,
3718 const Vec<Float, 16> &b)
3719{
3720 return _mm_cmpeq_ps(a, b);
3721}
3722
3723static SIMD_INLINE Vec<Double, 16> cmpeq(const Vec<Double, 16> &a,
3724 const Vec<Double, 16> &b)
3725{
3726 return _mm_cmpeq_pd(a, b);
3727}
3728
3729// ---------------------------------------------------------------------------
3730// compare >
3731// ---------------------------------------------------------------------------
3732
3733// http://stackoverflow.com/questions/16204663/sse-compare-packed-unsigned-bytes
3734static SIMD_INLINE Vec<Byte, 16> cmpgt(const Vec<Byte, 16> &a,
3735 const Vec<Byte, 16> &b)
3736{
3737 __m128i signbit = _mm_set1_epi32(0x80808080);
3738 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x80
3739 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x80
3740 return _mm_cmpgt_epi8(a1, b1);
3741}
3742
3743static SIMD_INLINE Vec<SignedByte, 16> cmpgt(const Vec<SignedByte, 16> &a,
3744 const Vec<SignedByte, 16> &b)
3745{
3746 return _mm_cmpgt_epi8(a, b);
3747}
3748
3749static SIMD_INLINE Vec<Word, 16> cmpgt(const Vec<Word, 16> &a,
3750 const Vec<Word, 16> &b)
3751{
3752 __m128i signbit = _mm_set1_epi32(0x80008000);
3753 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x8000
3754 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x8000
3755 return _mm_cmpgt_epi16(a1, b1);
3756}
3757
3758static SIMD_INLINE Vec<Short, 16> cmpgt(const Vec<Short, 16> &a,
3759 const Vec<Short, 16> &b)
3760{
3761 return _mm_cmpgt_epi16(a, b);
3762}
3763
3764static SIMD_INLINE Vec<Int, 16> cmpgt(const Vec<Int, 16> &a,
3765 const Vec<Int, 16> &b)
3766{
3767 return _mm_cmpgt_epi32(a, b);
3768}
3769
3770static SIMD_INLINE Vec<Long, 16> cmpgt(const Vec<Long, 16> &a,
3771 const Vec<Long, 16> &b)
3772{
3773#ifdef __SSE4_2__
3774 return _mm_cmpgt_epi64(a, b);
3775#else
3776 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
3777 const __m128i diff = _mm_sub_epi64(b, a);
3778#if 1 // TODO: check which is faster
3779 const __m128i res = _mm_xor_si128(
3780 diff, _mm_and_si128(_mm_xor_si128(b, a), _mm_xor_si128(diff, b)));
3781#else
3782 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
3783 _mm_andnot_si128(_mm_xor_si128(b, a), diff));
3784#endif
3785 // result in highest bit of res
3786 // spread highest bit to all bits
3787 const __m128i spread32 = _mm_srai_epi32(res, 31);
3788 return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
3789#endif
3790}
3791
3792static SIMD_INLINE Vec<Float, 16> cmpgt(const Vec<Float, 16> &a,
3793 const Vec<Float, 16> &b)
3794{
3795 return _mm_cmpgt_ps(a, b);
3796}
3797
3798static SIMD_INLINE Vec<Double, 16> cmpgt(const Vec<Double, 16> &a,
3799 const Vec<Double, 16> &b)
3800{
3801 return _mm_cmpgt_pd(a, b);
3802}
3803
3804// ---------------------------------------------------------------------------
3805// compare >=
3806// ---------------------------------------------------------------------------
3807
3808// http://stackoverflow.com/questions/16204663/sse-compare-packed-unsigned-bytes
3809static SIMD_INLINE Vec<Byte, 16> cmpge(const Vec<Byte, 16> &a,
3810 const Vec<Byte, 16> &b)
3811{
3812 __m128i signbit = _mm_set1_epi32(0x80808080);
3813 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x80
3814 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x80
3815 return _mm_or_si128(_mm_cmpgt_epi8(a1, b1), _mm_cmpeq_epi8(a1, b1));
3816}
3817
3818static SIMD_INLINE Vec<SignedByte, 16> cmpge(const Vec<SignedByte, 16> &a,
3819 const Vec<SignedByte, 16> &b)
3820{
3821 return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
3822}
3823
3824static SIMD_INLINE Vec<Word, 16> cmpge(const Vec<Word, 16> &a,
3825 const Vec<Word, 16> &b)
3826{
3827 __m128i signbit = _mm_set1_epi32(0x80008000);
3828 __m128i a1 = _mm_xor_si128(a, signbit); // sub 0x8000
3829 __m128i b1 = _mm_xor_si128(b, signbit); // sub 0x8000
3830 return _mm_or_si128(_mm_cmpgt_epi16(a1, b1), _mm_cmpeq_epi16(a1, b1));
3831}
3832
3833static SIMD_INLINE Vec<Short, 16> cmpge(const Vec<Short, 16> &a,
3834 const Vec<Short, 16> &b)
3835{
3836 return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
3837}
3838
3839static SIMD_INLINE Vec<Int, 16> cmpge(const Vec<Int, 16> &a,
3840 const Vec<Int, 16> &b)
3841{
3842 return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
3843}
3844
3845static SIMD_INLINE Vec<Long, 16> cmpge(const Vec<Long, 16> &a,
3846 const Vec<Long, 16> &b)
3847{
3848#ifdef __SSE4_2__
3849 return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
3850#else
3851 // Hacker's Delight, 2-12 Comparison Predicates: (swapped le)
3852 const __m128i res = _mm_and_si128(
3853 _mm_or_si128(b, _mm_xor_si128(a, _mm_set1_epi32(-1))),
3854 _mm_or_si128(_mm_xor_si128(b, a),
3855 _mm_xor_si128(_mm_sub_epi64(a, b), _mm_set1_epi32(-1))));
3856 // result in highest bit of res
3857 // spread highest bit to all bits
3858 const __m128i spread32 = _mm_srai_epi32(res, 31);
3859 return _mm_shuffle_epi32(spread32, _MM_SHUFFLE(3, 3, 1, 1));
3860#endif
3861}
3862
3863static SIMD_INLINE Vec<Float, 16> cmpge(const Vec<Float, 16> &a,
3864 const Vec<Float, 16> &b)
3865{
3866 return _mm_cmpge_ps(a, b);
3867}
3868
3869static SIMD_INLINE Vec<Double, 16> cmpge(const Vec<Double, 16> &a,
3870 const Vec<Double, 16> &b)
3871{
3872 return _mm_cmpge_pd(a, b);
3873}
3874
3875// ---------------------------------------------------------------------------
3876// compare !=
3877// ---------------------------------------------------------------------------
3878
3879// there is no cmpneq instruction for integers and no bitwise NOT, so use
3880// cmpeq and xor with all ones to invert the result
3881
3882static SIMD_INLINE Vec<Byte, 16> cmpneq(const Vec<Byte, 16> &a,
3883 const Vec<Byte, 16> &b)
3884{
3885 return _mm_xor_si128(_mm_cmpeq_epi8(a, b), _mm_set1_epi32(-1));
3886}
3887
3888static SIMD_INLINE Vec<SignedByte, 16> cmpneq(const Vec<SignedByte, 16> &a,
3889 const Vec<SignedByte, 16> &b)
3890{
3891 return _mm_xor_si128(_mm_cmpeq_epi8(a, b), _mm_set1_epi32(-1));
3892}
3893
3894static SIMD_INLINE Vec<Word, 16> cmpneq(const Vec<Word, 16> &a,
3895 const Vec<Word, 16> &b)
3896{
3897 return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_set1_epi32(-1));
3898}
3899
3900static SIMD_INLINE Vec<Short, 16> cmpneq(const Vec<Short, 16> &a,
3901 const Vec<Short, 16> &b)
3902{
3903 return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_set1_epi32(-1));
3904}
3905
3906static SIMD_INLINE Vec<Int, 16> cmpneq(const Vec<Int, 16> &a,
3907 const Vec<Int, 16> &b)
3908{
3909 return _mm_xor_si128(_mm_cmpeq_epi32(a, b), _mm_set1_epi32(-1));
3910}
3911
3912static SIMD_INLINE Vec<Long, 16> cmpneq(const Vec<Long, 16> &a,
3913 const Vec<Long, 16> &b)
3914{
3915#ifdef __SSE4_1__
3916 return _mm_xor_si128(_mm_cmpeq_epi64(a, b), _mm_set1_epi32(-1));
3917#else
3918 const __m128i eq32 = _mm_cmpeq_epi32(a, b);
3919 const __m128i shuffledRes = _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1));
3920 const __m128i eq64 = _mm_and_si128(eq32, shuffledRes);
3921 return _mm_xor_si128(eq64, _mm_set1_epi32(-1));
3922#endif
3923}
3924
3925static SIMD_INLINE Vec<Float, 16> cmpneq(const Vec<Float, 16> &a,
3926 const Vec<Float, 16> &b)
3927{
3928 return _mm_cmpneq_ps(a, b);
3929}
3930
3931static SIMD_INLINE Vec<Double, 16> cmpneq(const Vec<Double, 16> &a,
3932 const Vec<Double, 16> &b)
3933{
3934 return _mm_cmpneq_pd(a, b);
3935}
3936
3937// ---------------------------------------------------------------------------
3938// bit_and
3939// ---------------------------------------------------------------------------
3940
3941// all integer versions
3942template <typename T>
3943static SIMD_INLINE Vec<T, 16> bit_and(const Vec<T, 16> &a, const Vec<T, 16> &b)
3944{
3945 return _mm_and_si128(a, b);
3946}
3947
3948// float version
3949static SIMD_INLINE Vec<Float, 16> bit_and(const Vec<Float, 16> &a,
3950 const Vec<Float, 16> &b)
3951{
3952 return _mm_and_ps(a, b);
3953}
3954
3955// double version
3956static SIMD_INLINE Vec<Double, 16> bit_and(const Vec<Double, 16> &a,
3957 const Vec<Double, 16> &b)
3958{
3959 return _mm_and_pd(a, b);
3960}
3961
3962// ---------------------------------------------------------------------------
3963// bit_or
3964// ---------------------------------------------------------------------------
3965
3966// all integer versions
3967template <typename T>
3968static SIMD_INLINE Vec<T, 16> bit_or(const Vec<T, 16> &a, const Vec<T, 16> &b)
3969{
3970 return _mm_or_si128(a, b);
3971}
3972
3973// float version
3974static SIMD_INLINE Vec<Float, 16> bit_or(const Vec<Float, 16> &a,
3975 const Vec<Float, 16> &b)
3976{
3977 return _mm_or_ps(a, b);
3978}
3979
3980// double version
3981static SIMD_INLINE Vec<Double, 16> bit_or(const Vec<Double, 16> &a,
3982 const Vec<Double, 16> &b)
3983{
3984 return _mm_or_pd(a, b);
3985}
3986
3987// ---------------------------------------------------------------------------
3988// bit_andnot
3989// ---------------------------------------------------------------------------
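// note: as with the underlying andnot intrinsics, the first operand is
// complemented, i.e. bit_andnot(a, b) computes (~a) & b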
3990
3991// all integer versions
3992template <typename T>
3993static SIMD_INLINE Vec<T, 16> bit_andnot(const Vec<T, 16> &a,
3994 const Vec<T, 16> &b)
3995{
3996 return _mm_andnot_si128(a, b);
3997}
3998
3999// float version
4000static SIMD_INLINE Vec<Float, 16> bit_andnot(const Vec<Float, 16> &a,
4001 const Vec<Float, 16> &b)
4002{
4003 return _mm_andnot_ps(a, b);
4004}
4005
4006// double version
4007static SIMD_INLINE Vec<Double, 16> bit_andnot(const Vec<Double, 16> &a,
4008 const Vec<Double, 16> &b)
4009{
4010 return _mm_andnot_pd(a, b);
4011}
4012
4013// ---------------------------------------------------------------------------
4014// bit_xor
4015// ---------------------------------------------------------------------------
4016
4017// all integer versions
4018template <typename T>
4019static SIMD_INLINE Vec<T, 16> bit_xor(const Vec<T, 16> &a, const Vec<T, 16> &b)
4020{
4021 return _mm_xor_si128(a, b);
4022}
4023
4024// float version
4025static SIMD_INLINE Vec<Float, 16> bit_xor(const Vec<Float, 16> &a,
4026 const Vec<Float, 16> &b)
4027{
4028 return _mm_xor_ps(a, b);
4029}
4030
4031// double version
4032static SIMD_INLINE Vec<Double, 16> bit_xor(const Vec<Double, 16> &a,
4033 const Vec<Double, 16> &b)
4034{
4035 return _mm_xor_pd(a, b);
4036}
4037
4038// ---------------------------------------------------------------------------
4039// bit_not
4040// ---------------------------------------------------------------------------
4041
4042// all integer versions
4043template <typename T>
4044static SIMD_INLINE Vec<T, 16> bit_not(const Vec<T, 16> &a)
4045{
4046 // there is no _mm_not_si128, so xor with all 1's
4047 return _mm_xor_si128(a, _mm_set1_epi32(-1));
4048}
4049
4050// float version
4051static SIMD_INLINE Vec<Float, 16> bit_not(const Vec<Float, 16> &a)
4052{
4053 // there is no _mm_not_ps, so xor with all 1's
4054 return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1)));
4055}
4056
4057// double version
4058static SIMD_INLINE Vec<Double, 16> bit_not(const Vec<Double, 16> &a)
4059{
4060 // there is no _mm_not_pd, so xor with all 1's
4061 return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1)));
4062}
4063
4064// ---------------------------------------------------------------------------
4065// avg: average with rounding up
4066// ---------------------------------------------------------------------------
4067
4068static SIMD_INLINE Vec<Byte, 16> avg(const Vec<Byte, 16> &a,
4069 const Vec<Byte, 16> &b)
4070{
4071 return _mm_avg_epu8(a, b);
4072}
4073
4074// Paul R at
4075// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4076static SIMD_INLINE Vec<SignedByte, 16> avg(const Vec<SignedByte, 16> &a,
4077 const Vec<SignedByte, 16> &b)
4078{
4079 // from Agner Fog's VCL vectori128.h
4080 __m128i signbit = _mm_set1_epi32(0x80808080);
4081 __m128i a1 = _mm_xor_si128(a, signbit); // add 0x80
4082 __m128i b1 = _mm_xor_si128(b, signbit); // add 0x80
4083 __m128i m1 = _mm_avg_epu8(a1, b1); // unsigned avg
4084 return _mm_xor_si128(m1, signbit); // sub 0x80
4085}
4086
4087static SIMD_INLINE Vec<Word, 16> avg(const Vec<Word, 16> &a,
4088 const Vec<Word, 16> &b)
4089{
4090 return _mm_avg_epu16(a, b);
4091}
4092
4093// Paul R at
4094// http://stackoverflow.com/questions/12152640/signed-16-bit-sse-average
4095static SIMD_INLINE Vec<Short, 16> avg(const Vec<Short, 16> &a,
4096 const Vec<Short, 16> &b)
4097{
4098 // from Agner Fog's VCL vectori128.h
4099 __m128i signbit = _mm_set1_epi32(0x80008000);
4100 __m128i a1 = _mm_xor_si128(a, signbit); // add 0x8000
4101 __m128i b1 = _mm_xor_si128(b, signbit); // add 0x8000
4102 __m128i m1 = _mm_avg_epu16(a1, b1); // unsigned avg
4103 return _mm_xor_si128(m1, signbit); // sub 0x8000
4104}
4105
4106static SIMD_INLINE Vec<Int, 16> avg(const Vec<Int, 16> &a,
4107 const Vec<Int, 16> &b)
4108{
4109 // from Hacker's Delight, 2-5 Average of Two Integers:
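  // scalar form: (a | b) - ((a ^ b) >> 1) with an arithmetic shift; this is
  // the average rounded towards +infinity, without intermediate overflow,
  // e.g. avg(1, 2) = 3 - 1 = 2 and avg(-3, -4) = -3 - 0 = -3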
4110 return _mm_sub_epi32(_mm_or_si128(a, b),
4111 _mm_srai_epi32(_mm_xor_si128(a, b), 1));
4112}
4113
4114static SIMD_INLINE Vec<Long, 16> avg(const Vec<Long, 16> &a,
4115 const Vec<Long, 16> &b)
4116{
4117 // from Hacker's Delight, 2-5 Average of Two Integers:
4118 return _mm_sub_epi64(_mm_or_si128(a, b),
4119 srai<1>(Vec<Long, 16>(_mm_xor_si128(a, b))));
4120}
4121
4122// NOTE: Float version doesn't round!
4123static SIMD_INLINE Vec<Float, 16> avg(const Vec<Float, 16> &a,
4124 const Vec<Float, 16> &b)
4125{
4126 __m128 half = _mm_set1_ps(0.5f);
4127 return _mm_mul_ps(_mm_add_ps(a, b), half);
4128}
4129
4130// NOTE: Double version doesn't round!
4131static SIMD_INLINE Vec<Double, 16> avg(const Vec<Double, 16> &a,
4132 const Vec<Double, 16> &b)
4133{
4134 __m128d half = _mm_set1_pd(0.5);
4135 return _mm_mul_pd(_mm_add_pd(a, b), half);
4136}
4137
4138// ---------------------------------------------------------------------------
4139// test_all_zeros
4140// ---------------------------------------------------------------------------
4141
4142template <typename T>
4143static SIMD_INLINE bool test_all_zeros(const Vec<T, 16> &a)
4144{
4145 // reinterpret to int in case T is float
4146 const auto intA = reinterpret(a, OutputType<Int>());
4147#ifdef __SSE4_1__
4148 // 10. Oct 22 (Jonas Keller):
4149 // replaced unnecessary "_mm_cmpeq_epi8(a, a)" with "a"
4150 // return _mm_test_all_zeros(a, _mm_cmpeq_epi8(a, a));
4151 return _mm_test_all_zeros(intA, intA);
4152#else
4153 return (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_setzero_si128(), intA)) ==
4154 0xffff);
4155#endif
4156}
4157
4158// ---------------------------------------------------------------------------
4159// test_all_ones
4160// ---------------------------------------------------------------------------
4161
4162template <typename T>
4163static SIMD_INLINE bool test_all_ones(const Vec<T, 16> &a)
4164{
4165 // reinterpret to int in case T is float
4166 const auto intA = reinterpret(a, OutputType<Int>());
4167#ifdef __SSE4_1__
4168 return _mm_test_all_ones(intA);
4169#else
4170 __m128i undef = _mm_undefined_si128();
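  // cmpeq of a register with itself yields all ones regardless of its
  // (undefined) contents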
4171 __m128i ones = _mm_cmpeq_epi8(undef, undef);
4172 return _mm_movemask_epi8(_mm_cmpeq_epi8(ones, intA)) == 0xffff;
4173#endif
4174}
4175
4176// ---------------------------------------------------------------------------
4177// reverse
4178// ---------------------------------------------------------------------------
4179
4180// All reverse operations below are courtesy of Yannick Sander.
4181// modified
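// reverse returns the elements in reversed order, e.g. for Byte:
// (0, 1, ..., 15) -> (15, 14, ..., 0)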
4182
4183static SIMD_INLINE Vec<Byte, 16> reverse(const Vec<Byte, 16> &a)
4184{
4185 const __m128i mask =
4186 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4187
4188 // supported by SSSE3 and higher
4189 // compat available
4190 // no faster instruction available in newer instruction sets
4191 return _mm_shuffle_epi8(a, mask);
4192}
4193
4194static SIMD_INLINE Vec<SignedByte, 16> reverse(const Vec<SignedByte, 16> &a)
4195{
4196 const __m128i mask =
4197 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4198
4199 return _mm_shuffle_epi8(a, mask);
4200}
4201
4202static SIMD_INLINE Vec<Short, 16> reverse(const Vec<Short, 16> &a)
4203{
4204 const __m128i mask =
4205 _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
4206
4207 return _mm_shuffle_epi8(a, mask);
4208}
4209
4210static SIMD_INLINE Vec<Word, 16> reverse(const Vec<Word, 16> &a)
4211{
4212 const __m128i mask =
4213 _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
4214
4215 return _mm_shuffle_epi8(a, mask);
4216}
4217
4218static SIMD_INLINE Vec<Int, 16> reverse(const Vec<Int, 16> &a)
4219{
4220 return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
4221}
4222
4223static SIMD_INLINE Vec<Long, 16> reverse(const Vec<Long, 16> &a)
4224{
4225 return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
4226}
4227
4228static SIMD_INLINE Vec<Float, 16> reverse(const Vec<Float, 16> &a)
4229{
4230 return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
4231}
4232
4233static SIMD_INLINE Vec<Double, 16> reverse(const Vec<Double, 16> &a)
4234{
4235 return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1));
4236}
4237
4238// ---------------------------------------------------------------------------
4239// msb2int
4240// ---------------------------------------------------------------------------
4241
4242// 26. Aug 22 (Jonas Keller): added msb2int functions
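// msb2int collects the most significant bit of each element into an integer,
// with element i mapped to bit i, e.g. for Vec<Int, 16> with elements
// (-1, 5, -7, 0) the result is 0b0101 = 5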
4243
4244static SIMD_INLINE uint64_t msb2int(const Vec<Byte, 16> &a)
4245{
4246 return _mm_movemask_epi8(a);
4247}
4248
4249static SIMD_INLINE uint64_t msb2int(const Vec<SignedByte, 16> &a)
4250{
4251 return _mm_movemask_epi8(a);
4252}
4253
4254static SIMD_INLINE uint64_t msb2int(const Vec<Short, 16> &a)
4255{
4256 // move the upper bytes of the 8 shorts to the lower 8 bytes of the vector
4257 // and set the upper 8 bytes to 0, so that _mm_movemask_epi8
4258 // can be used to extract the upper bit of each short
4259 const __m128i mask =
4260 _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 15, 13, 11, 9, 7, 5, 3, 1);
4261 const __m128i shuffled = _mm_shuffle_epi8(a, mask);
4262 return _mm_movemask_epi8(shuffled);
4263}
4264
4265static SIMD_INLINE uint64_t msb2int(const Vec<Word, 16> &a)
4266{
4267 // move the upper bytes of the 8 words to the lower 8 bytes of the vector
4268 // and set the upper 8 bytes to 0, so that _mm_movemask_epi8
4269 // can be used to extract the upper bit of each word
4270 const __m128i mask =
4271 _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 15, 13, 11, 9, 7, 5, 3, 1);
4272 const __m128i shuffled = _mm_shuffle_epi8(a, mask);
4273 return _mm_movemask_epi8(shuffled);
4274}
4275
4276static SIMD_INLINE uint64_t msb2int(const Vec<Int, 16> &a)
4277{
4278 return _mm_movemask_ps(_mm_castsi128_ps(a));
4279}
4280
4281static SIMD_INLINE uint64_t msb2int(const Vec<Long, 16> &a)
4282{
4283 return _mm_movemask_pd(_mm_castsi128_pd(a));
4284}
4285
4286static SIMD_INLINE uint64_t msb2int(const Vec<Float, 16> &a)
4287{
4288 return _mm_movemask_ps(a);
4289}
4290
4291static SIMD_INLINE uint64_t msb2int(const Vec<Double, 16> &a)
4292{
4293 return _mm_movemask_pd(a);
4294}
4295
4296// ---------------------------------------------------------------------------
4297// int2msb
4298// ---------------------------------------------------------------------------
4299
4300// 06. Oct 22 (Jonas Keller): added int2msb functions
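// int2msb is the reverse direction of msb2int: element i has only its most
// significant bit set if bit i of the input is set and is zero otherwise,
// e.g. for Int the input 0b0101 gives elements (0x80000000, 0, 0x80000000, 0)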
4301
4302static SIMD_INLINE Vec<Byte, 16> int2msb(const uint64_t a, OutputType<Byte>,
4303 Integer<16>)
4304{
4305 // ssse3 version from https://stackoverflow.com/a/72899629
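  // broadcast the low mask byte of a to lanes 0..7 and the next byte to
  // lanes 8..15, isolate bit (i mod 8) in lane i via sel, compare against
  // sel to get all ones where bit i of a is set, and keep only the msb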
4306#ifdef __SSSE3__
4307 __m128i shuffleIndices = _mm_set_epi64x(0x0101010101010101, 0);
4308 __m128i aVec = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndices);
4309#else
4310 __m128i maskLo = _mm_set_epi64x(0, 0xffffffffffffffff);
4311 __m128i aLo = _mm_and_si128(maskLo, _mm_set1_epi8(a));
4312 __m128i aHi = _mm_andnot_si128(maskLo, _mm_set1_epi8(a >> 8));
4313 __m128i aVec = _mm_or_si128(aLo, aHi);
4314#endif
4315 __m128i sel = _mm_set1_epi64x(0x8040201008040201);
4316 __m128i selected = _mm_and_si128(aVec, sel);
4317 __m128i result = _mm_cmpeq_epi8(selected, sel);
4318 return _mm_and_si128(result, _mm_set1_epi8((int8_t) 0x80));
4319}
4320
4321static SIMD_INLINE Vec<SignedByte, 16> int2msb(const uint64_t a,
4322 OutputType<SignedByte>,
4323 Integer<16>)
4324{
4325 return reinterpret(int2msb(a, OutputType<Byte>(), Integer<16>()),
4326 OutputType<SignedByte>());
4327}
4328
4329static SIMD_INLINE Vec<Short, 16> int2msb(const uint64_t a, OutputType<Short>,
4330 Integer<16>)
4331{
4332 __m128i aVec = _mm_set1_epi16(a);
4333 __m128i sel = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004,
4334 0x0002, 0x0001);
4335 __m128i selected = _mm_and_si128(aVec, sel);
4336 __m128i result = _mm_cmpeq_epi16(selected, sel);
4337 return _mm_and_si128(result, _mm_set1_epi16((int16_t) 0x8000));
4338}
4339
4340static SIMD_INLINE Vec<Word, 16> int2msb(const uint64_t a, OutputType<Word>,
4341 Integer<16>)
4342{
4343 return reinterpret(int2msb(a, OutputType<Short>(), Integer<16>()),
4344 OutputType<Word>());
4345}
4346
4347static SIMD_INLINE Vec<Int, 16> int2msb(const uint64_t a, OutputType<Int>,
4348 Integer<16>)
4349{
4350 __m128i aVec = _mm_set1_epi32(a);
4351 __m128i sel = _mm_set_epi32(0x00000008, 0x00000004, 0x00000002, 0x00000001);
4352 __m128i selected = _mm_and_si128(aVec, sel);
4353 __m128i result = _mm_cmpeq_epi32(selected, sel);
4354 return _mm_and_si128(result, _mm_set1_epi32(0x80000000));
4355}
4356
4357static SIMD_INLINE Vec<Long, 16> int2msb(const uint64_t a, OutputType<Long>,
4358 Integer<16>)
4359{
4360 return _mm_set_epi64x((a & 2) ? 0x8000000000000000 : 0,
4361 (a & 1) ? 0x8000000000000000 : 0);
4362}
4363
4364static SIMD_INLINE Vec<Float, 16> int2msb(const uint64_t a, OutputType<Float>,
4365 Integer<16>)
4366{
4367 return reinterpret(int2msb(a, OutputType<Int>(), Integer<16>()),
4368 OutputType<Float>());
4369}
4370
4371static SIMD_INLINE Vec<Double, 16> int2msb(const uint64_t a, OutputType<Double>,
4372 Integer<16>)
4373{
4374 return _mm_set_pd((a & 2) ? -0.0 : 0.0, (a & 1) ? -0.0 : 0.0);
4375}
4376
4377// ---------------------------------------------------------------------------
4378// int2bits
4379// ---------------------------------------------------------------------------
4380
4381// 09. Oct 22 (Jonas Keller): added int2bits functions
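// int2bits works like int2msb, but element i is set to all ones (instead of
// only the most significant bit) if bit i of the input is set,
// e.g. for Int the input 0b0101 gives elements (-1, 0, -1, 0)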
4382
4383static SIMD_INLINE Vec<Byte, 16> int2bits(const uint64_t a, OutputType<Byte>,
4384 Integer<16>)
4385{
4386 // ssse3 version from https://stackoverflow.com/a/72899629
4387#ifdef __SSSE3__
4388 __m128i shuffleIndices = _mm_set_epi64x(0x0101010101010101, 0);
4389 __m128i aVec = _mm_shuffle_epi8(_mm_cvtsi32_si128(a), shuffleIndices);
4390#else
4391 __m128i maskLo = _mm_set_epi64x(0, 0xffffffffffffffff);
4392 __m128i aLo = _mm_and_si128(maskLo, _mm_set1_epi8(a));
4393 __m128i aHi = _mm_andnot_si128(maskLo, _mm_set1_epi8(a >> 8));
4394 __m128i aVec = _mm_or_si128(aLo, aHi);
4395#endif
4396 __m128i sel = _mm_set1_epi64x(0x8040201008040201);
4397 __m128i selected = _mm_and_si128(aVec, sel);
4398 return _mm_cmpeq_epi8(selected, sel);
4399}
4400
4401static SIMD_INLINE Vec<SignedByte, 16> int2bits(const uint64_t a,
4402 OutputType<SignedByte>,
4403 Integer<16>)
4404{
4405 return reinterpret(int2bits(a, OutputType<Byte>(), Integer<16>()),
4406 OutputType<SignedByte>());
4407}
4408
4409static SIMD_INLINE Vec<Short, 16> int2bits(const uint64_t a, OutputType<Short>,
4410 Integer<16>)
4411{
4412 __m128i aVec = _mm_set1_epi16(a);
4413 __m128i sel = _mm_set_epi16(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004,
4414 0x0002, 0x0001);
4415 __m128i selected = _mm_and_si128(aVec, sel);
4416 return _mm_cmpeq_epi16(selected, sel);
4417}
4418
4419static SIMD_INLINE Vec<Word, 16> int2bits(const uint64_t a, OutputType<Word>,
4420 Integer<16>)
4421{
4422 return reinterpret(int2bits(a, OutputType<Short>(), Integer<16>()),
4423 OutputType<Word>());
4424}
4425
4426static SIMD_INLINE Vec<Int, 16> int2bits(const uint64_t a, OutputType<Int>,
4427 Integer<16>)
4428{
4429 __m128i aVec = _mm_set1_epi32(a);
4430 __m128i sel = _mm_set_epi32(0x00000008, 0x00000004, 0x00000002, 0x00000001);
4431 __m128i selected = _mm_and_si128(aVec, sel);
4432 return _mm_cmpeq_epi32(selected, sel);
4433}
4434
4435static SIMD_INLINE Vec<Long, 16> int2bits(const uint64_t a, OutputType<Long>,
4436 Integer<16>)
4437{
4438 return _mm_set_epi64x((a & 2) ? -1 : 0, (a & 1) ? -1 : 0);
4439}
4440
4441static SIMD_INLINE Vec<Float, 16> int2bits(const uint64_t a, OutputType<Float>,
4442 Integer<16>)
4443{
4444 return reinterpret(int2bits(a, OutputType<Int>(), Integer<16>()),
4445 OutputType<Float>());
4446}
4447
4448static SIMD_INLINE Vec<Double, 16> int2bits(const uint64_t a,
4449 OutputType<Double>, Integer<16>)
4450{
4451 const auto trueVal = TypeInfo<Double>::trueval();
4452 return _mm_set_pd((a & 2) ? trueVal : 0.0, (a & 1) ? trueVal : 0.0);
4453}
4454
4455// ---------------------------------------------------------------------------
4456// iota
4457// ---------------------------------------------------------------------------
4458
4459// 30. Jan 23 (Jonas Keller): added iota
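// iota returns a vector with element i set to i, e.g. (0, 1, 2, 3) for Int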
4460
4461static SIMD_INLINE Vec<Byte, 16> iota(OutputType<Byte>, Integer<16>)
4462{
4463 return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4464}
4465
4466static SIMD_INLINE Vec<SignedByte, 16> iota(OutputType<SignedByte>, Integer<16>)
4467{
4468 return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4469}
4470
4471static SIMD_INLINE Vec<Short, 16> iota(OutputType<Short>, Integer<16>)
4472{
4473 return _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
4474}
4475
4476static SIMD_INLINE Vec<Word, 16> iota(OutputType<Word>, Integer<16>)
4477{
4478 return _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
4479}
4480
4481static SIMD_INLINE Vec<Int, 16> iota(OutputType<Int>, Integer<16>)
4482{
4483 return _mm_set_epi32(3, 2, 1, 0);
4484}
4485
4486static SIMD_INLINE Vec<Long, 16> iota(OutputType<Long>, Integer<16>)
4487{
4488 return _mm_set_epi64x(1, 0);
4489}
4490
4491static SIMD_INLINE Vec<Float, 16> iota(OutputType<Float>, Integer<16>)
4492{
4493 return _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
4494}
4495
4496static SIMD_INLINE Vec<Double, 16> iota(OutputType<Double>, Integer<16>)
4497{
4498 return _mm_set_pd(1.0, 0.0);
4499}
4500
4501} // namespace base
4502} // namespace internal
4503} // namespace simd
4504
4505#endif
4506
4507#endif // SIMD_VEC_BASE_IMPL_INTEL_16_H_