T-SIMD v31.1.0
A C++ template SIMD library
Loading...
Searching...
No Matches
ext.H
1// ===========================================================================
2//
3// extension commands combining multiple 1st-level vector template functions
4//
5// This source code file is part of the following software:
6//
7// - the low-level C++ template SIMD library
8// - the SIMD implementation of the MinWarping and the 2D-Warping methods
9// for local visual homing.
10//
11// The software is provided based on the accompanying license agreement in the
12// file LICENSE.md.
13// The software is provided "as is" without any warranty by the licensor and
14// without any liability of the licensor, and the software may not be
15// distributed by the licensee; see the license agreement for details.
16//
17// (C) Ralf Möller
18// Computer Engineering
19// Faculty of Technology
20// Bielefeld University
21// www.ti.uni-bielefeld.de
22//
23// ===========================================================================
24
25// 22. Jan 23 (Jonas Keller): moved internal implementations into internal
26// namespace
27
28// 03. Mar 23 (Jonas Keller): removed hsub and hsubs extensions, as they do not
29// do something useful
30
31// 09. Mar 23 (Jonas Keller): added doxygen documentation
32
33// 03. Aug 23 (Jonas Keller): renamed all swizzle2, unswizzle and transpose
34// versions to sequential names <base_name>_a, <base_name>_b, <base_name>_c,
35// etc. and moved them into the internal namespace and added a hub function
36// <base_name> that wraps the fastest version
37
38// 13. May 23 (Jonas Keller): added Double support
39
40#pragma once
41#ifndef SIMD_VEC_EXT_H_
42#define SIMD_VEC_EXT_H_
43
44#include "autogen/ext_transpose.H"
45#include "base.H"
46#include "defs.H"
47#include "types.H"
48#include "vec.H"
49
50#include <cassert>
51#include <cmath>
52#include <cstddef>
53#include <cstdio>
54#include <string>
55#include <type_traits>
56
57namespace simd {
58namespace internal {
59namespace ext {
60// https://stackoverflow.com/questions/23781506/compile-time-computing-of-number-of-bits-needed-to-encode-n-different-states
61template <typename T>
62static constexpr SIMD_INLINE T floorlog2(T x)
63{
64 static_assert(std::is_integral<T>::value, "");
65 return x == 1 ? 0 : 1 + floorlog2(x >> 1);
66}
67} // namespace ext
68} // namespace internal
69
70// determine NATIVE_SIMD_REG_COUNT
71// https://stackoverflow.com/questions/62419256/how-can-i-determine-how-many-avx-registers-my-processor-has
72
73// exclude from doxygen (until endcond)
75
76#ifdef __x86_64__
77#ifdef __AVX512VL__
78#define NATIVE_SIMD_REG_COUNT 32
79#else
80#define NATIVE_SIMD_REG_COUNT 16
81#endif
82#else
83#define NATIVE_SIMD_REG_COUNT 8
84#endif
86
87// ===========================================================================
88// print functions (for tests)
89// ===========================================================================
90
91// 04. Aug 22 (Jonas Keller):
92// removed treatZero(), not needed anymore because of change below
93//
94// // integer types don't have negative zero
95// template <typename T>
96// static SIMD_INLINE T
97// treatZero(T in)
98// {
99// return in;
100// }
101//
102// // Float: map -0.0f to 0.0f
103// static SIMD_INLINE Float
104// treatZero(Float in)
105// {
106// return (in == -0.0f) ? 0.0f : in;
107// }
108
126template <typename T, size_t SIMD_WIDTH>
127static SIMD_INLINE void fprint(FILE *f, const char *format,
128 const Vec<T, SIMD_WIDTH> &vec)
129{
130 // buffer
131 // 19. Jul 16 (rm)
132 const auto elems = Vec<T, SIMD_WIDTH>::elems; // SIMD_WIDTH/sizeof(T)
133 // T buf[SIMD_WIDTH];
134 T buf[elems];
135 // store vector (unaligned, not time-critical)
136 storeu(buf, vec);
137 // print elements of vector to f
138 for (size_t i = 0; i < elems; i++)
139 // 04. Aug 22 (Jonas Keller):
140 // removed mapping from -0.0f to 0.0f,
141 // for debugging you want to see -0.0f
142 // fprintf(f, format, treatZero(buf[i]));
143 fprintf(f, format, buf[i]);
144}
145
// Prints all elements of a vector to stdout using the given
// printf-style format string; convenience wrapper around fprint().
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void print(const char *format, const Vec<T, SIMD_WIDTH> &vec)
{
  fprint(stdout, format, vec);
}
161
176template <typename T, size_t SIMD_WIDTH>
177static SIMD_INLINE void fprint(FILE *f, const char *format,
178 const char *separator,
179 const Vec<T, SIMD_WIDTH> &vec)
180{
181 // 09. Jan 23 (Jonas Keller): used std::string instead of strcpy and strcat
182 // to avoid potential buffer overflows
183 // char fmtSep[256];
184 // strcat(strcpy(fmtSep, format), separator);
185 std::string fmtSep = std::string(format) + std::string(separator);
186 fprint(f, fmtSep.c_str(), vec);
187}
188
// Prints all elements of a vector to stdout, appending the given
// separator string after each element; convenience wrapper around
// fprint() on stdout.
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void print(const char *format, const char *separator,
                              const Vec<T, SIMD_WIDTH> &vec)
{
  fprint(stdout, format, separator, vec);
}
208
211// ===========================================================================
212// multi-vector store and load
213// ===========================================================================
214
225template <typename T, size_t SIMD_WIDTH>
226static SIMD_INLINE void load(const T *const p, Vec<T, SIMD_WIDTH> inVecs[],
227 size_t numInVecs)
228{
229 for (size_t i = 0; i < numInVecs; i++) {
230 inVecs[i] = load<SIMD_WIDTH>(&p[i * Vec<T, SIMD_WIDTH>::elems]);
231 }
232}
233
245template <typename T, size_t SIMD_WIDTH>
246static SIMD_INLINE void loadu(const T *const p, Vec<T, SIMD_WIDTH> inVecs[],
247 size_t numInVecs)
248{
249 for (size_t i = 0; i < numInVecs; i++) {
250 inVecs[i] = loadu<SIMD_WIDTH>(&p[i * Vec<T, SIMD_WIDTH>::elems]);
251 }
252}
253
264template <typename T, size_t SIMD_WIDTH>
265static SIMD_INLINE void store(T *const p, const Vec<T, SIMD_WIDTH> outVecs[],
266 size_t numOutVecs)
267{
268 for (size_t i = 0; i < numOutVecs; i++) {
269 store(&p[i * Vec<T, SIMD_WIDTH>::elems], outVecs[i]);
270 }
271}
272
284template <typename T, size_t SIMD_WIDTH>
285static SIMD_INLINE void storeu(T *const p, const Vec<T, SIMD_WIDTH> outVecs[],
286 size_t numOutVecs)
287{
288 for (size_t i = 0; i < numOutVecs; i++) {
289 storeu(&p[i * Vec<T, SIMD_WIDTH>::elems], outVecs[i]);
290 }
291}
292
303template <typename T, size_t SIMD_WIDTH>
304static SIMD_INLINE void store(T *const p, const Vec<T, SIMD_WIDTH> &outVec,
305 size_t numOutVecs)
306{
307 for (size_t i = 0; i < numOutVecs; i++) {
308 store(&p[i * Vec<T, SIMD_WIDTH>::elems], outVec);
309 }
310}
311
324template <typename T, size_t SIMD_WIDTH>
325static SIMD_INLINE void storeu(T *const p, const Vec<T, SIMD_WIDTH> &outVec,
326 size_t numOutVecs)
327{
328 for (size_t i = 0; i < numOutVecs; i++) {
329 storeu(&p[i * Vec<T, SIMD_WIDTH>::elems], outVec);
330 }
331}
332
333// -------------------- different store functions ----------------------------
334namespace internal {
335namespace ext {
336template <typename T, size_t SIMD_WIDTH>
337struct Store
338{
339 static SIMD_INLINE void _store(T *const p, const Vec<T, SIMD_WIDTH> &outVec)
340 {
341 return store(p, outVec);
342 }
343};
344
345template <typename T, size_t SIMD_WIDTH>
346struct StoreU
347{
348 static SIMD_INLINE void _store(T *const p, const Vec<T, SIMD_WIDTH> &outVec)
349 {
350 return storeu(p, outVec);
351 }
352};
353
354// ---------------------------------------------------------------------------
355// Meta Template Class Store16
356// used to store matrix after transposed with Transpose<Unpack16>
357//
358// TODO: Currently storing complete quadratic matrix. Integrate numOutVecs?
359//
360// gcc error with inline template function: inlining failed in call to
361// always_inline ‘void storeu16(..) [..]’: recursive inlining
362// ---------------------------------------------------------------------------
363
// Recursive case of Store16: split the current store width in half.
// The first half keeps the source offset and doubles the destination
// offset; the second half additionally advances the source offset by
// SIMD_WIDTH / STORE_WIDTH vectors and the destination offset by
// STORE_STOP rows. Recursion ends at the STORE_WIDTH == 16
// specialization below.
template <template <typename, size_t> class Store, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW, size_t STORE_STOP,
          size_t STORE_WIDTH, size_t SRC_OFF, size_t DST_OFF>
struct Store16
{
  static SIMD_INLINE void _store16(
    T *const p, const Vec<T, SIMD_WIDTH> outVecs[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("STORE_WIDTH=%d, SRC_OFF=%d, DST_OFF=%d\n",
    //        STORE_WIDTH, SRC_OFF, DST_OFF);
    Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, STORE_WIDTH / 2,
            SRC_OFF, 2 * DST_OFF>::_store16(p, outVecs);
    Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, STORE_WIDTH / 2,
            SRC_OFF + SIMD_WIDTH / STORE_WIDTH,
            2 * DST_OFF + STORE_STOP>::_store16(p, outVecs);
  }
};
381
// STORE_WIDTH == 16 specialization: store one vector per row and
// recurse over ROW until ROW == STORE_STOP (termination specialization
// below). VO indexes the source vector, OFF the destination element.
template <template <typename, size_t> class Store, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW, size_t STORE_STOP,
          size_t SRC_OFF, size_t DST_OFF>
struct Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, 16, SRC_OFF,
               DST_OFF>
{
  // row stride in source vectors
  static constexpr auto STEP = SIMD_WIDTH / 16;
  // index of the source vector for this row
  static constexpr auto VO = SRC_OFF + ROW * STEP;
  // destination element offset for this row
  static constexpr auto OFF = (DST_OFF + ROW) * NUMROWS;

  static SIMD_INLINE void _store16(
    T *const p, const Vec<T, SIMD_WIDTH> outVecs[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("VO=%d, OFF=%d\n", VO, OFF);
    Store<T, SIMD_WIDTH>::_store(p + OFF, outVecs[VO]);
    Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW + 1, STORE_STOP, 16, SRC_OFF,
            DST_OFF>::_store16(p, outVecs);
  }
};
401
// termination specialization: ROW has reached STORE_STOP, nothing
// left to store
template <template <typename, size_t> class Store, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t STORE_STOP, size_t SRC_OFF,
          size_t DST_OFF>
struct Store16<Store, T, SIMD_WIDTH, NUMROWS, STORE_STOP, STORE_STOP, 16,
               SRC_OFF, DST_OFF>
{
  static SIMD_INLINE void _store16(
    T *const, const Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems])
  {}
};
412
413// -------------------- store16 functions ------------------------------------
414
415template <typename T, size_t SIMD_WIDTH>
416static SIMD_INLINE void store16(
417 T *const p, const Vec<T, SIMD_WIDTH> outVecs[Vec<T, SIMD_WIDTH>::elems])
418{
419 const auto numRows = SIMD_WIDTH / sizeof(T);
420 const auto storeStop = 16 / sizeof(T);
421 internal::ext::Store16<internal::ext::Store, T, SIMD_WIDTH, numRows, 0,
422 storeStop, SIMD_WIDTH, 0, 0>::_store16(p, outVecs);
423}
424
425template <typename T, size_t SIMD_WIDTH>
426static SIMD_INLINE void storeu16(
427 T *const p, const Vec<T, SIMD_WIDTH> outVecs[Vec<T, SIMD_WIDTH>::elems])
428{
429 const auto numRows = SIMD_WIDTH / sizeof(T);
430 const auto storeStop = 16 / sizeof(T);
431 internal::ext::Store16<internal::ext::StoreU, T, SIMD_WIDTH, numRows, 0,
432 storeStop, SIMD_WIDTH, 0, 0>::_store16(p, outVecs);
433}
434
435} // namespace ext
436} // namespace internal
437
438// ===========================================================================
439// copy (load and store)
440// ===========================================================================
441
456template <size_t SIMD_WIDTH, typename T>
457static SIMD_INLINE void load_store(const T *const src, T *const dst)
458{
460 store(dst, copy);
461}
462
473template <size_t SIMD_WIDTH, typename T>
474static SIMD_INLINE void loadu_store(const T *const src, T *const dst)
475{
477 store(dst, copy);
478}
479
490template <size_t SIMD_WIDTH, typename T>
491static SIMD_INLINE void load_storeu(const T *const src, T *const dst)
492{
494 storeu(dst, copy);
495}
496
506template <size_t SIMD_WIDTH, typename T>
507static SIMD_INLINE void loadu_storeu(const T *const src, T *const dst)
508{
510 storeu(dst, copy);
511}
512
515// ===========================================================================
516// generalized packs
517// ===========================================================================
518
519// input is only signed
520// same-size input and output is allowed
521
522namespace internal {
523namespace ext {
524
525// no stage
526
// no stage: input and output type are identical, pass the single
// input vector through unchanged
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> packs(const Vec<T, SIMD_WIDTH> a[1],
                                            OutputType<T>, Compression<1>)
{
  return a[0];
}
533
534// single stage
535
// single stage: 2:1 compression, one call to the two-vector packs is
// sufficient
template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(const Vec<Tin, SIMD_WIDTH> a[2],
                                               OutputType<Tout>, Compression<2>)
{
  return packs<Tout>(a[0], a[1]);
}
542
543// two stages
544
545// via Short if Tout is Byte or SignedByte
// two stages, 4:1 compression via intermediate Short if Tout is Byte
// or SignedByte (sizeof(Tout) == 1)
template <typename Tout, typename Tin, size_t SIMD_WIDTH,
          SIMD_ENABLE_IF(sizeof(Tout) == 1)>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(const Vec<Tin, SIMD_WIDTH> a[4],
                                               OutputType<Tout>, Compression<4>)
{
  return packs<Tout>(packs<Short>(a[0], a[1]), packs<Short>(a[2], a[3]));
}
553
554// via Int if Tout is Word or Short
// two stages, 4:1 compression via intermediate Int if Tout is Word or
// Short (sizeof(Tout) == 2); the extra typename = void disambiguates
// from the sizeof(Tout) == 1 overload above
template <typename Tout, typename Tin, size_t SIMD_WIDTH,
          SIMD_ENABLE_IF(sizeof(Tout) == 2), typename = void>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(const Vec<Tin, SIMD_WIDTH> a[4],
                                               OutputType<Tout>, Compression<4>)
{
  return packs<Tout>(packs<Int>(a[0], a[1]), packs<Int>(a[2], a[3]));
}
562
563// two stages from Double
564
// two stages, 4:1 compression from Double input
template <typename Tout, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
  const Vec<Double, SIMD_WIDTH> a[4], OutputType<Tout>, Compression<4>)
{
  // always via Int (Double first converts to Int, then narrows)
  return packs<Tout>(packs<Int>(a[0], a[1]), packs<Int>(a[2], a[3]));
}
572
573// three stages
574
// three stages, 8:1 compression: first compress each half of the
// 8-vector input down to Short, then finish with a single pack
template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(const Vec<Tin, SIMD_WIDTH> a[8],
                                               OutputType<Tout>, Compression<8>)
{
  // always via Short
  return packs<Tout>(packs<Short>(a), packs<Short>(a + 4));
}
582
583// special cases: int <-> float, long <-> double
584
// special cases without compression: same-size int <-> float and
// long <-> double conversion via saturated element-wise cvts
template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(const Vec<Tin, SIMD_WIDTH> a[1],
                                               OutputType<Tout>, Compression<1>)
{
  // exactly one of Tin/Tout must be floating point, both the same size
  static_assert(sizeof(Tin) == sizeof(Tout), "");
  static_assert(std::is_floating_point<Tin>::value !=
                  std::is_floating_point<Tout>::value,
                "");
  return cvts<Tout>(a[0]);
}
595
596} // namespace ext
597} // namespace internal
598
599// generalized version of packs: includes multistage packing
600
// Generalized version of packs: packs sizeof(Tin)/sizeof(Tout) input
// vectors into a single output vector with saturation, using multiple
// packing stages where necessary. The tag objects OutputType and
// Compression select the matching internal overload at compile time.
template <typename Tout, typename Tin, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
  const Vec<Tin, SIMD_WIDTH> a[sizeof(Tin) / sizeof(Tout)])
{
  return internal::ext::packs(
    a, internal::OutputType<Tout>(),
    internal::Compression<sizeof(Tin) / sizeof(Tout)>());
}
628
629// ===========================================================================
630// generalized convert (using extend and packs)
631// ===========================================================================
632
633namespace internal {
634namespace ext {
// narrowing conversion (sizeof(Tout) < sizeof(Tin)): multiple input
// vectors are packed (with saturation) into a single output vector
template <typename Tout, typename Tin, size_t SIMD_WIDTH,
          SIMD_ENABLE_IF(sizeof(Tout) < sizeof(Tin))>
static SIMD_INLINE void convert(
  const Vec<Tin, SIMD_WIDTH> inVecs[sizeof(Tin) / sizeof(Tout)],
  Vec<Tout, SIMD_WIDTH> outVecs[1])
{
  outVecs[0] = packs<Tout>(inVecs);
}
643
// same-size or widening conversion (sizeof(Tout) >= sizeof(Tin)): one
// input vector extends into one or more output vectors; the extra
// typename = void disambiguates from the narrowing overload above
template <typename Tout, typename Tin, size_t SIMD_WIDTH,
          SIMD_ENABLE_IF(sizeof(Tout) >= sizeof(Tin)), typename = void>
static SIMD_INLINE void convert(const Vec<Tin, SIMD_WIDTH> inVecs[1],
                                Vec<Tout, SIMD_WIDTH> outVecs[1])
{
  extend(inVecs[0], outVecs);
}
651} // namespace ext
652} // namespace internal
653
675template <typename Tout, typename Tin, size_t SIMD_WIDTH>
676static SIMD_INLINE void convert(
679{
680 internal::ext::convert(inVecs, outVecs);
681}
682
683// ===========================================================================
684// float-based operations on arbitrary input and output types
685// ===========================================================================
686
687// internal helper functions for float-based operations:
688namespace internal {
689namespace ext {
// Number of Tfloat vectors needed to hold the extended input (or,
// equivalently, the pre-packed output) of a float-based operation;
// the static_assert verifies that both counts agree.
template <typename Tin, typename Tout, typename Tfloat>
static constexpr SIMD_INLINE size_t numCalcVecs()
{
  static_assert(numInVecs<Tout, Tin>() * sizeof(Tfloat) / sizeof(Tin) ==
                  numOutVecs<Tout, Tin>() * sizeof(Tfloat) / sizeof(Tout),
                "numCalcVecs() must be equal for input and output");
  return numInVecs<Tout, Tin>() * sizeof(Tfloat) / sizeof(Tin);
}
698
699template <typename Tin, typename Tout, typename Tfloat, size_t SIMD_WIDTH>
700static SIMD_INLINE void extendInToFloat(
701 const Vec<Tin, SIMD_WIDTH> inVecs[numInVecs<Tout, Tin>()],
702 Vec<Tfloat, SIMD_WIDTH> floatVecs[numCalcVecs<Tout, Tin, Tfloat>()])
703{
704 for (size_t i = 0; i < numInVecs<Tout, Tin>(); ++i) {
705 extend(inVecs[i], &floatVecs[i * sizeof(Tfloat) / sizeof(Tin)]);
706 }
707}
708
709template <typename Tin, typename Tout, typename Tfloat, size_t SIMD_WIDTH>
710static SIMD_INLINE void packsOutFromFloat(
711 const Vec<Tfloat, SIMD_WIDTH> floatVecs[numCalcVecs<Tout, Tin, Tfloat>()],
712 Vec<Tout, SIMD_WIDTH> outVecs[numOutVecs<Tout, Tin>()])
713{
714 for (size_t i = 0; i < numOutVecs<Tout, Tin>(); ++i) {
715 outVecs[i] = packs<Tout>(&floatVecs[i * sizeof(Tfloat) / sizeof(Tout)]);
716 }
717}
718} // namespace ext
719} // namespace internal
720
726// ---------------------------------------------------------------------------
727// divide then multiply with float constant in float arithmetic
728// ---------------------------------------------------------------------------
729
730// TODO: fdivmul: better fmuldiv = first multiply then divide?
731
732// 15. Mar 23 (Jonas Keller): fused the three cases
733
746template <typename Tout, typename Tin,
747 typename Tfloat = BigEnoughFloat<Tout, Tin>, size_t SIMD_WIDTH>
748static SIMD_INLINE void fdivmul(
753{
754 static_assert(sizeof(Tin) <= sizeof(Tfloat), "");
755 static_assert(sizeof(Tout) <= sizeof(Tfloat), "");
756 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
757 const auto facVec = set1<Tfloat, SIMD_WIDTH>(fac);
758 Vec<Tfloat, SIMD_WIDTH> numF[nFloatVecs], denomF[nFloatVecs],
759 resF[nFloatVecs];
760 internal::ext::extendInToFloat<Tin, Tout>(vecsNum, numF);
761 internal::ext::extendInToFloat<Tin, Tout>(vecsDenom, denomF);
762 for (size_t i = 0; i < nFloatVecs; i++) {
763 resF[i] = mul(div(numF[i], denomF[i]), facVec);
764 }
765 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
766}
767
768// ---------------------------------------------------------------------------
769// divide, apply multidimensional sigmoid and then multiply with float
770// constant in float arithmetic (derived from fdivmul)
771// sigmoid(x) = ((y(x,a)/(1-y(x,a)**4)**0.25))+1)/2)
772// y(x,a) = sum_d(a*w[d]*(x[d]-w0[d]))
773// a = -0.433 from fitting this to 1/(1+exp(y(x,1))
774// ---------------------------------------------------------------------------
775
776namespace internal {
777namespace ext {
// fdivMsigmoidmul, narrowing case (sizeof(Tout) < sizeof(Tin)):
// per dimension d, computes num/denom in Float arithmetic, applies
// the multidimensional sigmoid approximation described above, scales
// by fac and packs all results into a single output vector.
template <size_t DIM, size_t NVEC, typename Tout, typename Tin,
          size_t SIMD_WIDTH, SIMD_ENABLE_IF(sizeof(Tout) < sizeof(Tin))>
static SIMD_INLINE void fdivMsigmoidmul(
  const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
  const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC], const double w[DIM],
  const double w0[DIM], double fac, Vec<Tout, SIMD_WIDTH> vecsOut[1])
{
  // number of input vectors that pack into one output vector
  const auto nIn = sizeof(Tin) / sizeof(Tout);
  // number of Float vectors produced per input vector
  const auto fanIn = sizeof(Float) / sizeof(Tin);
  // fac/2 absorbs the "/2" of the sigmoid formula
  const auto facF = set1<Float, SIMD_WIDTH>(fac / 2.0);
  const auto oneF = set1<Float, SIMD_WIDTH>(1.0f);
  Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][fanIn],
    denomF[DIM][fanIn], resF[nIn * fanIn];
  for (size_t d = 0; d < DIM; d++) {
    // -0.433: fitted constant, see comment block above
    wF[d] = set1<Float, SIMD_WIDTH>(-0.433 * w[d]);
    w0F[d] = set1<Float, SIMD_WIDTH>(w0[d]);
  }
  // i: index of input vector
  // j: index of extended input vector
  // k: index of output vectors
  // TODO(review): the condition i < nIn was reportedly mis-optimized
  // (treated as always true) with -O2 in some builds — investigate
  for (size_t i = 0, k = 0; i < nIn; i++) {
    for (size_t d = 0; d < DIM; d++) {
      extend(vecsNum[d][i], numF[d]);
      extend(vecsDenom[d][i], denomF[d]);
    }
    for (size_t j = 0; j < fanIn; j++, k++) {
      // y = sum_d a*w[d]*(num/denom - w0[d])
      auto yF = setzero<Float, SIMD_WIDTH>();
      for (size_t d = 0; d < DIM; d++) {
        yF = add(yF, mul(wF[d], sub(div(numF[d][j], denomF[d][j]), w0F[d])));
      }
      // sigmoid(y) = ((y / (1 + y^4)^0.25) + 1) / 2, times fac
      auto y4F = mul(yF, yF);
      y4F = mul(y4F, y4F);
      resF[k] = mul(add(div(yF, sqrt(sqrt(add(oneF, y4F)))), oneF), facF);
    }
  }
  vecsOut[0] = packs<Tout>(resF);
}
816
// fdivMsigmoidmul, widening case (sizeof(Tout) > sizeof(Tin)): a
// single input vector per dimension is extended once up front and then
// distributed over sizeof(Tout)/sizeof(Tin) output vectors.
template <size_t DIM, size_t NVEC, typename Tout, typename Tin,
          size_t SIMD_WIDTH, SIMD_ENABLE_IF(sizeof(Tout) > sizeof(Tin)),
          typename = void>
static SIMD_INLINE void fdivMsigmoidmul(
  const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
  const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC], const double w[DIM],
  const double w0[DIM], double fac,
  Vec<Tout, SIMD_WIDTH> vecsOut[sizeof(Tout) / sizeof(Tin)])
{
  // number of output vectors produced from one input vector
  const auto nOut = sizeof(Tout) / sizeof(Tin);
  // number of Float vectors packed into one output vector
  const auto fanOut = sizeof(Float) / sizeof(Tout);
  // fac/2 absorbs the "/2" of the sigmoid formula
  const auto facF = set1<Float, SIMD_WIDTH>(fac / 2.0);
  const auto oneF = set1<Float, SIMD_WIDTH>(1.0f);
  Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][nOut * fanOut],
    denomF[DIM][nOut * fanOut], resF[fanOut];
  for (size_t d = 0; d < DIM; d++) {
    // -0.433: fitted constant, see comment block above
    wF[d] = set1<Float, SIMD_WIDTH>(-0.433 * w[d]);
    w0F[d] = set1<Float, SIMD_WIDTH>(w0[d]);
    // extend the (single) input vector of this dimension up front
    extend(*vecsNum[d], numF[d]);
    extend(*vecsDenom[d], denomF[d]);
  }
  // i: index of output vector
  // j: index of partial output vectors
  // k: index of input vector
  for (size_t i = 0, k = 0; i < nOut; i++) {
    for (size_t j = 0; j < fanOut; j++, k++) {
      // y = sum_d a*w[d]*(num/denom - w0[d])
      auto yF = setzero<Float, SIMD_WIDTH>();
      for (size_t d = 0; d < DIM; d++) {
        yF = add(yF, mul(wF[d], sub(div(numF[d][k], denomF[d][k]), w0F[d])));
      }
      // sigmoid(y) = ((y / (1 + y^4)^0.25) + 1) / 2, times fac
      auto y4F = mul(yF, yF);
      y4F = mul(y4F, y4F);
      resF[j] = mul(add(div(yF, sqrt(sqrt(add(oneF, y4F)))), oneF), facF);
    }
    vecsOut[i] = packs<Tout>(resF);
  }
}
854
// fdivMsigmoidmul, same-size case (sizeof(Tout) == sizeof(Tin)): one
// input vector per dimension maps to exactly one output vector.
template <size_t DIM, size_t NVEC, typename Tout, typename Tin,
          size_t SIMD_WIDTH, SIMD_ENABLE_IF(sizeof(Tout) == sizeof(Tin)),
          typename = void, typename = void>
static SIMD_INLINE void fdivMsigmoidmul(
  const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
  const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC], const double w[DIM],
  const double w0[DIM], double fac, Vec<Tout, SIMD_WIDTH> vecsOut[1])
{
  // number of Float vectors per input (and per output) vector
  const auto fanInOut = sizeof(Float) / sizeof(Tin);
  // fac/2 absorbs the "/2" of the sigmoid formula
  const auto facF = set1<Float, SIMD_WIDTH>(fac / 2.0);
  const auto oneF = set1<Float, SIMD_WIDTH>(1.0f);
  Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][fanInOut],
    denomF[DIM][fanInOut], resF[fanInOut];
  for (size_t d = 0; d < DIM; d++) {
    // -0.433: fitted constant, see comment block above
    wF[d] = set1<Float, SIMD_WIDTH>(-0.433 * w[d]);
    w0F[d] = set1<Float, SIMD_WIDTH>(w0[d]);
    extend(*vecsNum[d], numF[d]);
    extend(*vecsDenom[d], denomF[d]);
  }
  // j: index of extended input/output vector
  for (size_t j = 0; j < fanInOut; j++) {
    // y = sum_d a*w[d]*(num/denom - w0[d])
    auto yF = setzero<Float, SIMD_WIDTH>();
    for (size_t d = 0; d < DIM; d++) {
      yF = add(yF, mul(wF[d], sub(div(numF[d][j], denomF[d][j]), w0F[d])));
    }
    // sigmoid(y) = ((y / (1 + y^4)^0.25) + 1) / 2, times fac
    auto y4F = mul(yF, yF);
    y4F = mul(y4F, y4F);
    resF[j] = mul(add(div(yF, sqrt(sqrt(add(oneF, y4F)))), oneF), facF);
  }
  vecsOut[0] = packs<Tout>(resF);
}
886} // namespace ext
887} // namespace internal
888
899template <size_t DIM, size_t NVEC, typename Tout, typename Tin,
900 size_t SIMD_WIDTH>
901static SIMD_INLINE void fdivMsigmoidmul(
902 const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
903 const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC], const double w[DIM],
904 const double w0[DIM], double fac,
906{
907 static_assert(sizeof(Tin) <= sizeof(Float), "");
908 static_assert(sizeof(Tout) <= sizeof(Float), "");
909 internal::ext::fdivMsigmoidmul<DIM, NVEC>(vecsNum, vecsDenom, w, w0, fac,
910 vecsOut);
911}
912
913// ---------------------------------------------------------------------------
914// multiply with float constant in float arithmetic
915// ---------------------------------------------------------------------------
916
917// 15. Mar 23 (Jonas Keller): fused the three cases
918
929template <typename Tout, typename Tin,
930 typename Tfloat = BigEnoughFloat<Tout, Tin>, size_t SIMD_WIDTH>
931static SIMD_INLINE void fmul(
935{
936 static_assert(sizeof(Tin) <= sizeof(Tfloat), "");
937 static_assert(sizeof(Tout) <= sizeof(Tfloat), "");
938 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
939 const auto facVec = set1<Tfloat, SIMD_WIDTH>(fac);
940 Vec<Tfloat, SIMD_WIDTH> inF[nFloatVecs], resF[nFloatVecs];
941 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
942 for (size_t i = 0; i < nFloatVecs; i++) { resF[i] = mul(inF[i], facVec); }
943 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
944}
945
946// ---------------------------------------------------------------------------
947// add then multiply with float constant in float arithmetic
948// ---------------------------------------------------------------------------
949
950// 15. Mar 23 (Jonas Keller): fused the three cases
951
963template <typename Tout, typename Tin,
964 typename Tfloat = BigEnoughFloat<Tout, Tin>, size_t SIMD_WIDTH>
965static SIMD_INLINE void faddmul(
969{
970 static_assert(sizeof(Tin) <= sizeof(Tfloat), "");
971 static_assert(sizeof(Tout) <= sizeof(Tfloat), "");
972 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
973 const auto offVec = set1<Tfloat, SIMD_WIDTH>(off);
974 const auto facVec = set1<Tfloat, SIMD_WIDTH>(fac);
975 Vec<Tfloat, SIMD_WIDTH> inF[nFloatVecs], resF[nFloatVecs];
976 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
977 for (size_t i = 0; i < nFloatVecs; i++) {
978 resF[i] = mul(add(inF[i], offVec), facVec);
979 }
980 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
981}
982
983// ---------------------------------------------------------------------------
984// multiply then add with float constant in float arithmetic
985// ---------------------------------------------------------------------------
986
987// better for conversion of zero-centered data to unsigned pixel format
988
989// 15. Mar 23 (Jonas Keller): fused the three cases
990
1002template <typename Tout, typename Tin,
1003 typename Tfloat = BigEnoughFloat<Tout, Tin>, size_t SIMD_WIDTH>
1004static SIMD_INLINE void fmuladd(
1008{
1009 static_assert(sizeof(Tin) <= sizeof(Tfloat), "");
1010 static_assert(sizeof(Tout) <= sizeof(Tfloat), "");
1011 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
1012 const auto facVec = set1<Tfloat, SIMD_WIDTH>(fac);
1013 const auto offVec = set1<Tfloat, SIMD_WIDTH>(off);
1014 Vec<Tfloat, SIMD_WIDTH> inF[nFloatVecs], resF[nFloatVecs];
1015 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
1016 for (size_t i = 0; i < nFloatVecs; i++) {
1017 resF[i] = add(mul(inF[i], facVec), offVec);
1018 }
1019 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
1020}
1021
1022// ---------------------------------------------------------------------------
1023// multiply with float constant in float arithmetic
1024// ---------------------------------------------------------------------------
1025
1026// 15. Mar 23 (Jonas Keller): fused the three cases
1027
1028// fac * [v2 + w * (v1 - v2)] = fac * [w * v1 + (1 - w) * v2], w in [0,1]
1029// w: weight factor (in [0,1])
1030// fac: scale factor
1031
1048template <typename Tout, typename Tin,
1049 typename Tfloat = BigEnoughFloat<Tout, Tin>, size_t SIMD_WIDTH>
1050static SIMD_INLINE void fwaddmul(
1055{
1056 static_assert(sizeof(Tin) <= sizeof(Tfloat), "");
1057 static_assert(sizeof(Tout) <= sizeof(Tfloat), "");
1058 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
1059 const auto wVec = set1<Tfloat, SIMD_WIDTH>(w);
1060 const auto facVec = set1<Tfloat, SIMD_WIDTH>(fac);
1061 Vec<Tfloat, SIMD_WIDTH> inF1[nFloatVecs], inF2[nFloatVecs], resF[nFloatVecs];
1062 internal::ext::extendInToFloat<Tin, Tout>(vecsIn1, inF1);
1063 internal::ext::extendInToFloat<Tin, Tout>(vecsIn2, inF2);
1064 for (size_t i = 0; i < nFloatVecs; i++) {
1065 resF[i] = mul(facVec, add(inF2[i], mul(wVec, sub(inF1[i], inF2[i]))));
1066 }
1067 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
1068}
1069
1077// ===========================================================================
1078// horizontal add/adds/sub/subs: generic form for multiple vector inputs
1079// ===========================================================================
1080
1081// TODO: is there an easy way to implement multivec horizontal min/max?
1082// TODO: (Hackers delight: min/max via doz = hsubs?)
1083
1084namespace internal {
1085namespace ext {
1086// primary template
1087// num: number of elements processed
1088// i0, i1: indices of lowest elements of block
// primary template: pairwise tree reduction over multiple vectors
// num: number of vectors processed in this sub-tree
// i0, i1: indices of the lowest vector of the left and right half
template <typename T, size_t SIMD_WIDTH, size_t num, size_t i0, size_t i1>
struct Horizontal
{
  // horizontal (modulo) add over the sub-tree
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
    const Vec<T, SIMD_WIDTH> v[Vec<T, SIMD_WIDTH>::elems])
  {
    return hadd(Horizontal<T, SIMD_WIDTH, num / 2, i0, i0 + num / 4>::_hadd(v),
                Horizontal<T, SIMD_WIDTH, num / 2, i1, i1 + num / 4>::_hadd(v));
  }

  // horizontal saturated add over the sub-tree
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
    const Vec<T, SIMD_WIDTH> v[Vec<T, SIMD_WIDTH>::elems])
  {
    return hadds(
      Horizontal<T, SIMD_WIDTH, num / 2, i0, i0 + num / 4>::_hadds(v),
      Horizontal<T, SIMD_WIDTH, num / 2, i1, i1 + num / 4>::_hadds(v));
  }
};
1107
1108// partial specialization to end the recursion
// partial specialization to end the recursion: combine a single pair
// of vectors
template <typename T, size_t SIMD_WIDTH, size_t i0, size_t i1>
struct Horizontal<T, SIMD_WIDTH, 2, i0, i1>
{
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
    const Vec<T, SIMD_WIDTH> v[Vec<T, SIMD_WIDTH>::elems])
  {
    return hadd(v[i0], v[i1]);
  }

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
    const Vec<T, SIMD_WIDTH> v[Vec<T, SIMD_WIDTH>::elems])
  {
    return hadds(v[i0], v[i1]);
  }
};
1124} // namespace ext
1125} // namespace internal
1126
1127// function template
1128
1137template <typename T, size_t SIMD_WIDTH>
1138static SIMD_INLINE Vec<T, SIMD_WIDTH> hadd(
1140{
1141 return internal::ext::Horizontal<T, SIMD_WIDTH, Vec<T, SIMD_WIDTH>::elems, 0,
1142 (Vec<T, SIMD_WIDTH>::elems) / 2>::_hadd(v);
1143}
1144
1145// function template
1146
1156template <typename T, size_t SIMD_WIDTH>
1157static SIMD_INLINE Vec<T, SIMD_WIDTH> hadds(
1159{
1160 return internal::ext::Horizontal<T, SIMD_WIDTH, Vec<T, SIMD_WIDTH>::elems, 0,
1161 (Vec<T, SIMD_WIDTH>::elems) / 2>::_hadds(v);
1162}
1163
1164// ===========================================================================
1165// horizontal operations (generic form for single vector input)
1166// ===========================================================================
1167
1168// these operations are not fully parallel!
1169
1170// example: SIMD_WIDTH = 16, T = float
1171// extract<0>(Horizontal1<float,16,2>::_hadd(v));
1172// u = Horizontal1<float,16,1>::_hadd(v);
1173// hadd(v, v)
1174// hadd(u, u)
1175
1176namespace internal {
1177namespace ext {
// recursive horizontal reduction within a single vector (not fully
// parallel); NUM is the number of element pairs still to be combined
// at this stage; callers extract element 0 of the final result
template <typename T, size_t SIMD_WIDTH, size_t NUM>
struct Horizontal1
{
  // horizontal (modulo) add
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(const Vec<T, SIMD_WIDTH> &v)
  {
    Vec<T, SIMD_WIDTH> u = Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hadd(v);
    return hadd(u, u);
  }

  // horizontal saturated add
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(const Vec<T, SIMD_WIDTH> &v)
  {
    Vec<T, SIMD_WIDTH> u = Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hadds(v);
    return hadds(u, u);
  }

  // horizontal minimum: combine with copy shifted by NUM elements
  // (srle presumably shifts right element-wise — see base.H)
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmin(const Vec<T, SIMD_WIDTH> &v)
  {
    return Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hmin(min(srle<NUM>(v), v));
  }

  // horizontal maximum: analogous to _hmin
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmax(const Vec<T, SIMD_WIDTH> &v)
  {
    return Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hmax(max(srle<NUM>(v), v));
  }
};
1203
// specialization to end the recursion: final combination step, only
// element 0 of the result is meaningful to callers
template <typename T, size_t SIMD_WIDTH>
struct Horizontal1<T, SIMD_WIDTH, 1>
{
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(const Vec<T, SIMD_WIDTH> &v)
  {
    return hadd(v, v);
  }

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(const Vec<T, SIMD_WIDTH> &v)
  {
    return hadds(v, v);
  }

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmin(const Vec<T, SIMD_WIDTH> &v)
  {
    return min(srle<1>(v), v);
  }

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmax(const Vec<T, SIMD_WIDTH> &v)
  {
    return max(srle<1>(v), v);
  }
};
1227} // namespace ext
1228} // namespace internal
1229
1236template <typename T, size_t SIMD_WIDTH>
1237static SIMD_INLINE T hadd(const Vec<T, SIMD_WIDTH> &v)
1238{
1239 return extract<0>(
1240 internal::ext::Horizontal1<T, SIMD_WIDTH,
1241 SIMD_WIDTH / sizeof(T) / 2>::_hadd(v));
1242}
1243
1252template <typename T, size_t SIMD_WIDTH>
1253static SIMD_INLINE T hadds(const Vec<T, SIMD_WIDTH> &v)
1254{
1255 return extract<0>(
1256 internal::ext::Horizontal1<T, SIMD_WIDTH,
1257 SIMD_WIDTH / sizeof(T) / 2>::_hadds(v));
1258}
1259
1266template <typename T, size_t SIMD_WIDTH>
1267static SIMD_INLINE T hmin(const Vec<T, SIMD_WIDTH> &v)
1268{
1269 return extract<0>(
1270 internal::ext::Horizontal1<T, SIMD_WIDTH,
1271 SIMD_WIDTH / sizeof(T) / 2>::_hmin(v));
1272}
1273
1280template <typename T, size_t SIMD_WIDTH>
1281static SIMD_INLINE T hmax(const Vec<T, SIMD_WIDTH> &v)
1282{
1283 return extract<0>(
1284 internal::ext::Horizontal1<T, SIMD_WIDTH,
1285 SIMD_WIDTH / sizeof(T) / 2>::_hmax(v));
1286}
1287
1290// ===========================================================================
1291// iterative horizontal accumulation
1292// ===========================================================================
1293
1299// 04. Aug 23 (Jonas Keller): added classes for iterative horizontal
1300// accumulation
1301
1309template <class HOp, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
1310struct HAcc
1311{
1312private:
1313 size_t count = 0;
1314 size_t stackTop = 0;
1315 Vec<T, SIMD_WIDTH> stack[internal::ext::floorlog2(Vec<T, SIMD_WIDTH>::elems)];
1316
1317public:
1324 bool isEmpty() const { return count == 0; }
1325
1331 bool isDone() const { return count == Vec<T, SIMD_WIDTH>::elems; }
1332
1340 {
1341 if (isDone()) { return; }
1342 auto acc = v;
1343 for (size_t i = 0; count & (1 << i); i++) {
1344 stackTop--;
1345 acc = HOp::apply(stack[stackTop], acc);
1346 }
1347 stack[stackTop] = acc;
1348 stackTop++;
1349 count++;
1350 }
1351
1356 void finish()
1357 {
1358 while (!isDone()) {
1359 push(set1<T, SIMD_WIDTH>(HOp::template neutralValue<T>()));
1360 }
1361 }
1362
1370 {
1371 finish();
1372 return stack[0];
1373 }
1374
1378 void reset()
1379 {
1380 count = 0;
1381 stackTop = 0;
1382 }
1383};
1384
1393template <class HOp, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
1395{
1396private:
1397 T *const ptr;
1398 size_t index = 0;
1400
1401public:
1408 HAccStore(T *const p) : ptr(p) {}
1409
1410 ~HAccStore() { finish(); }
1411
1420 {
1421 hacc.push(v);
1422 if (hacc.isDone()) {
1423 // TODO: aligned/unaligned?
1424 storeu(&ptr[index], hacc.get());
1426 hacc.reset();
1427 }
1428 }
1429
1434 void finish()
1435 {
1436 // if hacc is not empty, we have to finish and store;
1437 // if hacc was reset after last push, nothing happens
1438 if (hacc.isEmpty()) return;
1439
1440 hacc.finish();
1441 storeu(&ptr[index], hacc.get());
1442 index = 0;
1443 hacc.reset();
1444 }
1445};
1446
// Horizontal-add operation policy for HAcc / HAccStore.
struct HAdd
{
  // exclude from doxygen (until endcond)
  template <typename T, size_t SIMD_WIDTH>
  static SIMD_INLINE Vec<T, SIMD_WIDTH> apply(const Vec<T, SIMD_WIDTH> &a,
                                              const Vec<T, SIMD_WIDTH> &b)
  {
    return hadd(a, b);
  }

  // 0 is the neutral element of addition
  template <typename T>
  static SIMD_INLINE T neutralValue()
  {
    return T(0);
  }
};
1469
// Saturated horizontal-add operation policy for HAcc / HAccStore.
struct HAdds
{
  // exclude from doxygen (until endcond)
  template <typename T, size_t SIMD_WIDTH>
  static SIMD_INLINE Vec<T, SIMD_WIDTH> apply(const Vec<T, SIMD_WIDTH> &a,
                                              const Vec<T, SIMD_WIDTH> &b)
  {
    return hadds(a, b);
  }

  // 0 is the neutral element of (saturated) addition
  template <typename T>
  static SIMD_INLINE T neutralValue()
  {
    return T(0);
  }
};
1493
// Minimum operation policy for HAcc / HAccStore.
struct HMin
{
  // exclude from doxygen (until endcond)
  template <typename T, size_t SIMD_WIDTH>
  static SIMD_INLINE Vec<T, SIMD_WIDTH> apply(const Vec<T, SIMD_WIDTH> &a,
                                              const Vec<T, SIMD_WIDTH> &b)
  {
    return min(a, b);
  }

  // largest representable value is the neutral element of min
  // NOTE(review): std::numeric_limits needs <limits>, which is not among
  // this file's visible includes — presumably pulled in transitively;
  // confirm
  template <typename T>
  static SIMD_INLINE T neutralValue()
  {
    return std::numeric_limits<T>::max();
  }
};
1516
// Maximum operation policy for HAcc / HAccStore.
struct HMax
{
  // exclude from doxygen (until endcond)
  template <typename T, size_t SIMD_WIDTH>
  static SIMD_INLINE Vec<T, SIMD_WIDTH> apply(const Vec<T, SIMD_WIDTH> &a,
                                              const Vec<T, SIMD_WIDTH> &b)
  {
    return max(a, b);
  }

  // lowest representable value (not min(), which is the smallest positive
  // value for floating-point types) is the neutral element of max
  // NOTE(review): std::numeric_limits needs <limits>, which is not among
  // this file's visible includes — presumably pulled in transitively;
  // confirm
  template <typename T>
  static SIMD_INLINE T neutralValue()
  {
    return std::numeric_limits<T>::lowest();
  }
};
1539
1547// ===========================================================================
1548// avgru: synonym f. average with rounding up
1549// ===========================================================================
1550
1551// this is just a synonym for avg which is compatible with the auxiliary avgrd
1552
// Average with rounding up; synonym for avg(), provided for naming symmetry
// with avgrd() below.
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> avgru(const Vec<T, SIMD_WIDTH> &a,
                                            const Vec<T, SIMD_WIDTH> &b)
{
  return avg(a, b);
}
1569
1570// ===========================================================================
1571// avgrd: average with rounding down
1572// ===========================================================================
1573
1574// 30. Jul 17 (rm): removed unnecessary tag dispatching for avgrd()
1575
1576namespace internal {
1577namespace ext {
1578// int types
1579template <typename T, size_t SIMD_WIDTH,
1580 SIMD_ENABLE_IF(std::is_integral<T>::value)>
1581static SIMD_INLINE Vec<T, SIMD_WIDTH> avgrd(const Vec<T, SIMD_WIDTH> &a,
1582 const Vec<T, SIMD_WIDTH> &b)
1583{
1584 Vec<T, SIMD_WIDTH> one = set1<T, SIMD_WIDTH>(1), as, bs, lsb;
1585 lsb = bit_and(bit_and(a, b), one);
1586 as = div2rd(a);
1587 bs = div2rd(b);
1588 return add(lsb, add(as, bs));
1589}
1590
// NOTE: no rounding for floating-point types
template <typename T, size_t SIMD_WIDTH,
          SIMD_ENABLE_IF(std::is_floating_point<T>::value), typename = void>
static SIMD_INLINE Vec<T, SIMD_WIDTH> avgrd(const Vec<T, SIMD_WIDTH> &a,
                                            const Vec<T, SIMD_WIDTH> &b)
{
  // exact mean (up to fp rounding): 0.5 * (a + b)
  return mul(add(a, b), set1<T, SIMD_WIDTH>(0.5));
}
1599} // namespace ext
1600} // namespace internal
1601
// Average with rounding down; dispatches to the integral or floating-point
// implementation above via the SFINAE-selected overloads.
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> avgrd(const Vec<T, SIMD_WIDTH> &a,
                                            const Vec<T, SIMD_WIDTH> &b)
{
  return internal::ext::avgrd(a, b);
}
1616
1617// ===========================================================================
1618// div2r0: integer div. by 2 with round to 0 (for integers)
1619// ===========================================================================
1620
1621namespace internal {
1622namespace ext {
// unsigned: logical shift right == division by 2, already rounds to 0
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Byte, SIMD_WIDTH> div2r0(const Vec<Byte, SIMD_WIDTH> &a)
{
  return srli<1>(a);
}
1628
// 16. Oct 22 (Jonas Keller): added missing version for SignedByte
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<SignedByte, SIMD_WIDTH> div2r0(
  const Vec<SignedByte, SIMD_WIDTH> &a)
{
  // add 1 if number is negative (srli<7> extracts the sign bit as 1),
  // then arithmetic shift: turns floor division into round-toward-zero
  return srai<1>(add(a, srli<7>(a)));
}
1637
// unsigned: logical shift right == division by 2, already rounds to 0
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Word, SIMD_WIDTH> div2r0(const Vec<Word, SIMD_WIDTH> &a)
{
  return srli<1>(a);
}
1643
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Short, SIMD_WIDTH> div2r0(
  const Vec<Short, SIMD_WIDTH> &a)
{
  // add 1 if number is negative (srli<15> extracts the sign bit), then
  // arithmetic shift: rounds toward zero
  return srai<1>(add(a, srli<15>(a)));
}
1651
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Int, SIMD_WIDTH> div2r0(const Vec<Int, SIMD_WIDTH> &a)
{
  // add 1 if number is negative (srli<31> extracts the sign bit), then
  // arithmetic shift: rounds toward zero
  return srai<1>(add(a, srli<31>(a)));
}
1658
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Long, SIMD_WIDTH> div2r0(const Vec<Long, SIMD_WIDTH> &a)
{
  // add 1 if number is negative (srli<63> extracts the sign bit), then
  // arithmetic shift: rounds toward zero
  return srai<1>(add(a, srli<63>(a)));
}
1665
// NOTE: no rounding for float
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH> div2r0(
  const Vec<Float, SIMD_WIDTH> &a)
{
  // exact halving: multiply by 0.5
  return mul(set1<Float, SIMD_WIDTH>(0.5f), a);
}
1673
// NOTE: no rounding for double
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Double, SIMD_WIDTH> div2r0(
  const Vec<Double, SIMD_WIDTH> &a)
{
  // exact halving: multiply by 0.5
  return mul(set1<Double, SIMD_WIDTH>(0.5), a);
}
1681} // namespace ext
1682} // namespace internal
1683
1695template <typename T, size_t SIMD_WIDTH>
1697{
1698 return internal::ext::div2r0(a);
1699}
1700
1701// ===========================================================================
1702// div2rd: integer division by two with rounding down (for integers)
1703// ===========================================================================
1704
1705namespace internal {
1706namespace ext {
// unsigned: logical shift right rounds down already
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Byte, SIMD_WIDTH> div2rd(const Vec<Byte, SIMD_WIDTH> &a)
{
  return srli<1>(a);
}
1712
// 16. Oct 22 (Jonas Keller): added missing version for SignedByte
// signed: arithmetic shift right == floor division by 2
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<SignedByte, SIMD_WIDTH> div2rd(
  const Vec<SignedByte, SIMD_WIDTH> &a)
{
  return srai<1>(a);
}
1720
// unsigned: logical shift right rounds down already
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Word, SIMD_WIDTH> div2rd(const Vec<Word, SIMD_WIDTH> &a)
{
  return srli<1>(a);
}
1726
// signed: arithmetic shift right == floor division by 2
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Short, SIMD_WIDTH> div2rd(
  const Vec<Short, SIMD_WIDTH> &a)
{
  return srai<1>(a);
}
1733
// signed: arithmetic shift right == floor division by 2
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Int, SIMD_WIDTH> div2rd(const Vec<Int, SIMD_WIDTH> &a)
{
  return srai<1>(a);
}
1739
// signed: arithmetic shift right == floor division by 2
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Long, SIMD_WIDTH> div2rd(const Vec<Long, SIMD_WIDTH> &a)
{
  return srai<1>(a);
}
1745
// NOTE: no rounding for float
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Float, SIMD_WIDTH> div2rd(
  const Vec<Float, SIMD_WIDTH> &a)
{
  // exact halving: multiply by 0.5
  return mul(set1<Float, SIMD_WIDTH>(0.5f), a);
}
1753
// NOTE: no rounding for double
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Double, SIMD_WIDTH> div2rd(
  const Vec<Double, SIMD_WIDTH> &a)
{
  // exact halving: multiply by 0.5
  return mul(set1<Double, SIMD_WIDTH>(0.5), a);
}
1761} // namespace ext
1762} // namespace internal
1763
1775template <typename T, size_t SIMD_WIDTH>
1777{
1778 return internal::ext::div2rd(a);
1779}
1780
1781// ===========================================================================
1782// sign function (Float and Double only)
1783// ===========================================================================
1784
1785// contributed by Benedikt Volkmer
1786// negate a, where b is negative
1787// note: contrary to IEEE 754, this function considers -0.0f to be negative
1788
1799template <typename T, size_t SIMD_WIDTH>
1800static SIMD_INLINE Vec<T, SIMD_WIDTH> sign(const Vec<T, SIMD_WIDTH> &a,
1801 const Vec<T, SIMD_WIDTH> &b)
1802{
1803 static_assert(std::is_floating_point<T>::value,
1804 "sign() is only available for floating-point types");
1805 // -0.0F aka. 0x80000000 aka. 1000...b
1806 return bit_xor(a, bit_and(set1<T, SIMD_WIDTH>(T(-0.0)), b));
1807}
1808
1809// ===========================================================================
1810// absDiff function
1811// ===========================================================================
1812
1813// contributed by Benedikt Volkmer
1814// 23. Mar 22 (rm): removed SFINAE enable_if construct
1815// (not compatible with C++98)
1816// Computes elementwise absolute difference of vectors
1817
1818namespace internal {
1819namespace ext {
1820
1821// Use these overloads of the function template if Type is unsigned
1822
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Byte, SIMD_WIDTH> absDiff(
  const Vec<Byte, SIMD_WIDTH> &v1, const Vec<Byte, SIMD_WIDTH> &v2)
{
  // Trick working around non-existing abs() for unsigned Type:
  // saturated subs clamps the "wrong" direction to 0, so exactly one of
  // the two operands of bit_or is non-zero and equals |v1 - v2|
  return bit_or(subs(v1, v2), subs(v2, v1));
}
1830
template <size_t SIMD_WIDTH>
static SIMD_INLINE Vec<Word, SIMD_WIDTH> absDiff(
  const Vec<Word, SIMD_WIDTH> &v1, const Vec<Word, SIMD_WIDTH> &v2)
{
  // Trick working around non-existing abs() for unsigned Type:
  // saturated subs clamps the "wrong" direction to 0, so exactly one of
  // the two operands of bit_or is non-zero and equals |v1 - v2|
  return bit_or(subs(v1, v2), subs(v2, v1));
}
1838
1839// Use these overloads of the function template if Type is signed
1840
1841template <typename T, size_t SIMD_WIDTH>
1842static SIMD_INLINE Vec<T, SIMD_WIDTH> absDiff(const Vec<T, SIMD_WIDTH> &v1,
1843 const Vec<T, SIMD_WIDTH> &v2)
1844{
1845 static_assert(std::is_signed<T>::value, "");
1846 return abs(sub(v1, v2));
1847}
1848} // namespace ext
1849} // namespace internal
1850
// Elementwise absolute difference |v1 - v2|; dispatches to the unsigned
// (saturation trick) or signed (abs of difference) overloads above.
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE Vec<T, SIMD_WIDTH> absDiff(const Vec<T, SIMD_WIDTH> &v1,
                                              const Vec<T, SIMD_WIDTH> &v2)
{
  return internal::ext::absDiff(v1, v2);
}
1865
1868namespace internal {
1869namespace ext {
1870
1871// ===========================================================================
1872// transpose
1873// ===========================================================================
1874
1875// -------------------- different unpack functions ---------------------------
1876
// Functor wrapper so unpack() can be passed as a template template
// argument to the transpose machinery below.
template <size_t PART, size_t NUM_ELEMS, typename T, size_t SIMD_WIDTH>
struct Unpack
{
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _unpack(const Vec<T, SIMD_WIDTH> &a,
                                                const Vec<T, SIMD_WIDTH> &b)
  {
    return unpack<PART, NUM_ELEMS>(a, b);
  }
};
1886
// Functor wrapper for unpack16() (the 16-byte-lane-local unpack variant —
// see its definition for exact semantics), same interface as Unpack.
template <size_t PART, size_t NUM_ELEMS, typename T, size_t SIMD_WIDTH>
struct Unpack16
{
  static SIMD_INLINE Vec<T, SIMD_WIDTH> _unpack(const Vec<T, SIMD_WIDTH> &a,
                                                const Vec<T, SIMD_WIDTH> &b)
  {
    return unpack16<PART, NUM_ELEMS>(a, b);
  }
};
1896
1897// ------------------------ transpose a single row ---------------------------
1898
1899// primary template
// Computes a single output row of the transposed matrix: recursively
// unpacks halves of the input-row range; the bits of NLOHI select the
// low/high unpack part at each stage, so the recursion path encodes the
// output row index.
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH,
          // INDEX: index of first input element to unpack
          // NLOHI: low/high unpack selector index
          // ELEMS: number of elements to unpack in this stage
          size_t INDEX, size_t NLOHI, size_t ELEMS>
struct Transpose1
{
  static constexpr auto PART = (NLOHI & 0x01); // low/high part at this stage
  static constexpr auto NEXT = (NLOHI >> 1);   // selector for deeper stages
  static constexpr auto LIDX = INDEX;          // left sub-range start
  static constexpr auto RIDX = INDEX + ELEMS;  // right sub-range start
  static constexpr auto HALF = ELEMS / 2;      // sub-range element count

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _transpose1(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("_transpose1("
    //        "INDEX=%d NLOHI=%d ELEMS=%d PART=%d LIDX=%d RIDX=%d HALF=%d)\n",
    //        INDEX, NLOHI, ELEMS, PART, LIDX, RIDX, HALF);
    // TODO: T,SIMD_WIDTH necessary or can it be deduced from arguments?
    return Unpack<PART, ELEMS, T, SIMD_WIDTH>::_unpack(
      Transpose1<Unpack, T, SIMD_WIDTH, LIDX, NEXT, HALF>::_transpose1(inRows),
      Transpose1<Unpack, T, SIMD_WIDTH, RIDX, NEXT, HALF>::_transpose1(inRows));
  }
};
1926
1927// partial specialization to end the iteration (ELEMS=1)
// partial specialization to end the iteration (ELEMS=1): unpack two
// adjacent input rows directly
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t INDEX, size_t NLOHI>
struct Transpose1<Unpack, T, SIMD_WIDTH, INDEX, NLOHI, 1>
{
  static constexpr auto PART = (NLOHI & 0x01); // low/high part at this stage

  static SIMD_INLINE Vec<T, SIMD_WIDTH> _transpose1(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("_transpose1(INDEX=%d NLOHI=%d *ELEMS=%d PART=%d)\n",
    //        INDEX, NLOHI, 1, PART);
    // TODO: T,SIMD_WIDTH necessary or can it be deduced from arguments?
    return Unpack<PART, 1, T, SIMD_WIDTH>::_unpack(inRows[INDEX],
                                                   inRows[INDEX + 1]);
  }
};
1944
1945// ----------------------- transpose multiple rows --------------------------
1946
1947// primary template
// Transposes NUM_TRANSPOSE_ROWS output rows, computing each row via
// Transpose1 and advancing ROW by compile-time recursion (terminated by
// the ROW == NUM_TRANSPOSE_ROWS specialization below).
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH,
          // NUMROWS: total number of rows
          // NUM_TRANSPOSE_ROWS: number of rows to transpose
          // ROW: index of row to transpose
          size_t NUMROWS, size_t NUM_TRANSPOSE_ROWS, size_t ROW>
struct Transpose
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[NUM_TRANSPOSE_ROWS])
  {
    // printf("\n_transpose(NUMROWS=%d,ROW=%d)\n", NUMROWS, ROW);
    // transpose single row with index ROW
    outRows[ROW] =
      // INDEX=0, NLOWHI=ROW, ELEMS=NUMROWS/2
      Transpose1<Unpack, T, SIMD_WIDTH, 0, ROW, NUMROWS / 2>::_transpose1(
        inRows);
    // transpose next row
    // NUMROWS=NUMROWS, ROW=ROW+1
    Transpose<Unpack, T, SIMD_WIDTH, NUMROWS, NUM_TRANSPOSE_ROWS,
              ROW + 1>::_transpose(inRows, outRows);
  }
};
1972
1973// partial specialization to end the iteration
// partial specialization to end the iteration (ROW == NUM_TRANSPOSE_ROWS):
// nothing left to do
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t NUM_TRANSPOSE_ROWS>
struct Transpose<Unpack, T, SIMD_WIDTH, NUMROWS, NUM_TRANSPOSE_ROWS,
                 NUM_TRANSPOSE_ROWS>
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH>[NUM_TRANSPOSE_ROWS])
  {}
};
1984
1985// function template: partial transpose
// partial transpose: computes only the first NUM_TRANSPOSE_ROWS output
// rows of the transposed matrix
template <size_t NUM_TRANSPOSE_ROWS, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose_a_partial(
  const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
  Vec<T, SIMD_WIDTH> outRows[NUM_TRANSPOSE_ROWS])
{
  Transpose<Unpack, T, SIMD_WIDTH,
            // NUMROWS, NUM_TRANSPOSE_ROWS, ROW
            SIMD_WIDTH / sizeof(T), NUM_TRANSPOSE_ROWS, 0>::_transpose(inRows,
                                                                       outRows);
}
1996
1997// function template: full transpose
// full transpose: all SIMD_WIDTH/sizeof(T) rows
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose_a(
  const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
  Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
{
  transpose_a_partial<SIMD_WIDTH / sizeof(T)>(inRows, outRows);
}
2005
2006// ===========================================================================
2007// copy matrix
2008// ===========================================================================
2009
2010// primary template
// Compile-time-unrolled row-by-row copy of a matrix of vectors
// (rows [ROW, ROW_STOP) of v are copied to v2).
template <typename T, size_t SIMD_WIDTH, size_t ROW, size_t ROW_STOP>
struct CopyMatrix
{
  static_assert(ROW < ROW_STOP, "ROW must be less than ROW_STOP");

  static SIMD_INLINE void _copy(Vec<T, SIMD_WIDTH> v[ROW_STOP],
                                Vec<T, SIMD_WIDTH> v2[ROW_STOP])
  {
    v2[ROW] = v[ROW];
    CopyMatrix<T, SIMD_WIDTH, ROW + 1, ROW_STOP>::_copy(v, v2);
  }
};
2023
2024// partial specialization to end the iteration
// partial specialization to end the iteration (ROW == ROW_STOP)
template <typename T, size_t SIMD_WIDTH, size_t ROW_STOP>
struct CopyMatrix<T, SIMD_WIDTH, ROW_STOP, ROW_STOP>
{
  static SIMD_INLINE void _copy(Vec<T, SIMD_WIDTH>[ROW_STOP],
                                Vec<T, SIMD_WIDTH>[ROW_STOP])
  {}
};
2032
2033// ===========================================================================
2034// Transpose Post-Process
2035// ===========================================================================
2036
2037// ------------------------ transpose post-process 16 ------------------------
2038// Used to post-process transposed matrix using unpack16
2039
2040// primary template
// Post-processing for a Transpose<Unpack16> result: recursively halves
// TRANSPOSE_WIDTH while tracking source (SRC_OFF) and destination
// (DST_OFF) row offsets, until the TRANSPOSE_WIDTH=16 specialization below
// copies the rows to their final positions.
template <typename T, size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW,
          size_t ROW_STOP, size_t TRANSPOSE_WIDTH, size_t SRC_OFF,
          size_t DST_OFF>
struct TransposePostprocess16
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("%s", "\nTransposePostprocess16\n");
    // printf("TRANSPOSE_WIDTH=%d\n", TRANSPOSE_WIDTH);
    TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
                           TRANSPOSE_WIDTH / 2, SRC_OFF,
                           2 * DST_OFF>::_transpose(inRows, outRows);
    TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
                           TRANSPOSE_WIDTH / 2,
                           SRC_OFF + SIMD_WIDTH / TRANSPOSE_WIDTH,
                           2 * DST_OFF + ROW_STOP>::_transpose(inRows, outRows);
  }
};
2061
2062// partial specialization
// partial specialization (TRANSPOSE_WIDTH=16): copy row SRC_ROW to DST_ROW
// and advance ROW by compile-time recursion
template <typename T, size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW,
          size_t ROW_STOP, size_t SRC_OFF, size_t DST_OFF>
struct TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP, 16,
                              SRC_OFF, DST_OFF>
{
  static constexpr auto STEP    = SIMD_WIDTH / 16;         // rows per lane step
  static constexpr auto SRC_ROW = SRC_OFF + ROW * STEP;    // source row index
  static constexpr auto DST_ROW = DST_OFF + ROW;           // destination row

  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("%s", "\nTransposePostprocess16\n");
    // printf("TRANSPOSE_WIDTH=%d\n", 16);
    // printf("SRC_ROW=%d DST_ROW=%d\n", SRC_ROW, DST_ROW);
    outRows[DST_ROW] = inRows[SRC_ROW];
    TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW + 1, ROW_STOP, 16,
                           SRC_OFF, DST_OFF>::_transpose(inRows, outRows);
  }
};
2084
2085// partial specialization to end the iteration
// partial specialization to end the iteration (ROW == ROW_STOP)
template <typename T, size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW_STOP,
          size_t SRC_OFF, size_t DST_OFF>
struct TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW_STOP, ROW_STOP, 16,
                              SRC_OFF, DST_OFF>
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems])
  {}
};
2096
2097// ------------------------ transpose post-process hub -----------------------
2098
2099// primary template
// Post-process hub: the generic case (e.g. plain Unpack) needs no
// post-processing, so the primary template is a no-op; the Unpack16
// specialization below reorders the lane-interleaved rows.
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH>
struct TransposePostprocess
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems])
  {}
};
2109
2110// partial specialization to post-process Transpose<Unpack16>
// partial specialization to post-process Transpose<Unpack16>
template <typename T, size_t SIMD_WIDTH>
struct TransposePostprocess<Unpack16, T, SIMD_WIDTH>
{
  static constexpr auto NUMROWS  = SIMD_WIDTH / sizeof(T); // rows in matrix
  static constexpr auto ROW_STOP = 16 / sizeof(T);         // rows per 16-byte lane

  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("%s", "\nTransposePostprocess\n");
    // printf("SIMD_WIDTH=%d TYPE=%s\n", SIMD_WIDTH, TypeInfo<T>::name());
    TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, 0, ROW_STOP, SIMD_WIDTH, 0,
                           0>::_transpose(inRows, outRows);
  }
};
2127
2128// ===========================================================================
2129// transpose_b: Transpose<Unpack16> + post-process
2130// ===========================================================================
2131
2132// contributed by Adam Marschall
2133
2134// function template: full transpose
2135template <typename T, size_t SIMD_WIDTH>
2136static SIMD_INLINE void transpose_b(
2137 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2138 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
2139{
2140 Vec<T, SIMD_WIDTH> tempRows[Vec<T, SIMD_WIDTH>::elements];
2141 Transpose<Unpack16, T, SIMD_WIDTH, SIMD_WIDTH / sizeof(T),
2142 SIMD_WIDTH / sizeof(T), 0>::_transpose(inRows, tempRows);
2143 TransposePostprocess<Unpack16, T, SIMD_WIDTH>::_transpose(tempRows, outRows);
2144}
2145
2146// ===========================================================================
2147// transpose_c: Transpose<Unpack16> - needs store16
2148// ===========================================================================
2149
2150// contributed by Adam Marschall
2151
2152// function template: full transpose (includes store16)
// full transpose (includes store16): transposes with lane-local Unpack16,
// then resolves the lane interleaving by a lane-wise store (storeu16)
// followed by a linear reload into outRows
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose_c(
  const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
  Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
{
  Vec<T, SIMD_WIDTH> tempOutRows[Vec<T, SIMD_WIDTH>::elems];

  Transpose<Unpack16, T, SIMD_WIDTH, SIMD_WIDTH / sizeof(T),
            SIMD_WIDTH / sizeof(T), 0>::_transpose(inRows, tempOutRows);

  // post-process with store16 ...
  const auto N = SIMD_WIDTH / sizeof(T); // rows/columns of the matrix
  T outArray[N * N];
  storeu16(outArray, tempOutRows);
  // ... and load to outRows
  loadu(outArray, outRows, N);
}
2170
2171// ===========================================================================
2172// Transpose16: Template Class to transpose multiple rows with integrated
2173// Unpack16 post-process
2174// Uses Transpose1 to transpose single rows.
2175// ===========================================================================
2176
2177// ----------------------- transpose multiple rows --------------------------
2178
2179// primary template
// Transpose with integrated Unpack16 post-processing: recursively halves
// TRANSPOSE_WIDTH while tracking source (SRC_OFF) and destination (DST_OFF)
// offsets — same offset scheme as TransposePostprocess16 — until the
// TRANSPOSE_WIDTH=16 specialization below computes the rows.
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH,
          // NUMROWS: total number of rows
          // ROW: index of row to transpose
          size_t NUMROWS, size_t ROW, size_t ROW_STOP, size_t TRANSPOSE_WIDTH,
          size_t SRC_OFF, size_t DST_OFF>
struct Transpose16
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
  {
    Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
                TRANSPOSE_WIDTH / 2, SRC_OFF, 2 * DST_OFF>::_transpose(inRows,
                                                                       outRows);
    Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
                TRANSPOSE_WIDTH / 2, SRC_OFF + SIMD_WIDTH / TRANSPOSE_WIDTH,
                2 * DST_OFF + ROW_STOP>::_transpose(inRows, outRows);
  }
};
2200
2201// partial specialization to end first iteration
// partial specialization to end first iteration (TRANSPOSE_WIDTH=16):
// computes output row DST_ROW via Transpose1 using selector SRC_ROW, then
// advances ROW by compile-time recursion
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW, size_t ROW_STOP,
          size_t SRC_OFF, size_t DST_OFF>
struct Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP, 16, SRC_OFF,
                   DST_OFF>
{
  static constexpr auto STEP    = SIMD_WIDTH / 16;      // rows per lane step
  static constexpr auto SRC_ROW = SRC_OFF + ROW * STEP; // Transpose1 selector
  static constexpr auto DST_ROW = DST_OFF + ROW;        // destination row

  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
  {
    // printf("\n_transpose_b(SRC=%d,DST=%d)", SRC_ROW, DST_ROW);
    // printf("\n ROW=%d,OFF=%d,STEP=%d", ROW, TRANSPOSE_OFFSET, STEP);
    // transpose single row with index SRC_ROW
    outRows[DST_ROW] = Transpose1<Unpack, T, SIMD_WIDTH,
                                  // INDEX=0, NLOWHI=SRC_ROW, ELEMS=NUMROWS/2
                                  0, SRC_ROW, NUMROWS / 2>::_transpose1(inRows);
    // transpose next row
    // NUMROWS=NUMROWS, ROW=ROW+1
    Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW + 1, ROW_STOP, 16, SRC_OFF,
                DST_OFF>::_transpose(inRows, outRows);
  }
};
2228
2229// partial specialization to end the iteration
// partial specialization to end the iteration (ROW == ROW_STOP)
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t NUMROWS, size_t ROW_STOP, size_t SRC_OFF,
          size_t DST_OFF>
struct Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW_STOP, ROW_STOP, 16,
                   SRC_OFF, DST_OFF>
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems],
    Vec<T, SIMD_WIDTH>[Vec<T, SIMD_WIDTH>::elems])
  {}
};
2241
2242// contributed by Adam Marschall
2243
// full transpose using Transpose16, i.e. the Unpack16 transposition with
// the post-processing folded into the row computation itself
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose_d(
  const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
  Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
{
  Transpose16<Unpack16, T, SIMD_WIDTH,
              // NUMROWS, ROW, ROW_STOP
              SIMD_WIDTH / sizeof(T), 0, 16 / sizeof(T),
              // TRANSPOSE_WIDTH, SRC_OFF, DST_OFF
              SIMD_WIDTH, 0, 0>::_transpose(inRows, outRows);
}
2255
2256// ===========================================================================
2257// swizzle2_a (deinterleave)
2258// ===========================================================================
2259
2260// contributed by Adam Marschall
2261
2262// generalized from Marat Dukhan's solution referred to at
2263// https://stackoverflow.com/a/15377386/3852630
2264// takes 2*N input elements
2265
2266// TODO: swizzling chunks of multiple elements (useful?)
2267// TODO: could be possible by starting loop at sizeof(T) and
2268// TODO: using zip<NUM_ELEMS>
2269
2270// FINALBLKSIZE template argument is required since function is also
2271// used for transpose_e
// Deinterleaves 2*N vectors in place: each pass zips the first half of v
// with the second half; after floorlog2(FINALBLKSIZE)+1 passes the data is
// fully deinterleaved (blkSize only counts the passes, it is not used in
// the body).
template <size_t N, size_t FINALBLKSIZE, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2_a(Vec<T, SIMD_WIDTH> v[2 * N])
{
  Vec<T, SIMD_WIDTH> v2[2 * N];
  for (size_t blkSize = 1; blkSize <= FINALBLKSIZE; blkSize *= 2) {
    // zip
    for (size_t src = 0, dst = 0; src < N; src++, dst += 2)
      zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);
    // copy result back to v
    // TODO: swizzle2_a: check code produced by compiler for copying
    for (size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
  }
}
2285
// convenience wrapper: deinterleave completely (FINALBLKSIZE = number of
// elements per vector)
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2_a(Vec<T, SIMD_WIDTH> v[2 * N])
{
  swizzle2_a<N, Vec<T, SIMD_WIDTH>::elements>(v);
}
2291
2292// ===========================================================================
2293// transpose_e
2294// ===========================================================================
2295
2296// contributed by Adam Marschall
2297
// full transpose implemented by copying the input and repeatedly
// deinterleaving it in place with swizzle2_a
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose_e(
  const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
  Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
{
  constexpr auto num = Vec<T, SIMD_WIDTH>::elements;
  for (size_t i = 0; i < num; i++) outRows[i] = inRows[i];
  swizzle2_a<num / 2, num / 2>(outRows);
}
2307
2308// ===========================================================================
2309// swizzle2_b (deinterleave)
2310// ===========================================================================
2311
2312// contributed by Adam Marschall
2313
2314// generalized from Marat Dukhan's solution referred to at
2315// https://stackoverflow.com/a/15377386/3852630
2316// takes 2*N input elements
2317
2318// FINALBLKSIZE template argument is required since function is also
2319// used for transpose_f
// Deinterleaves 2*N vectors in place like swizzle2_a, but avoids the
// explicit copy-back in most passes by zipping "there and back" between v
// and v2 in pairs; a single final zip+copy pass remains only if the total
// number of passes (floorlog2(FINALBLKSIZE)+1) is odd.
template <size_t N, size_t FINALBLKSIZE, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2_b(Vec<T, SIMD_WIDTH> v[2 * N])
{
  Vec<T, SIMD_WIDTH> v2[2 * N];
  const auto origReps = floorlog2(FINALBLKSIZE) + 1; // passes of swizzle2_a
  const auto finalReps = origReps / 2;               // there-and-back pairs
  // printf("origReps=%d finalReps=%d\n", origReps, finalReps);

  for (size_t rep = 0; rep < finalReps; rep++) {
    // zip there ...
    for (size_t src = 0, dst = 0; src < N; src++, dst += 2)
      zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);

    // ... and zip back again
    for (size_t src = 0, dst = 0; src < N; src++, dst += 2)
      zip<1>(v2[src], v2[src + N], v[dst], v[dst + 1]);
  }

  // skip post-amble in case of even origReps
  if (origReps % 2 == 0) return;

  // zip there ...
  for (size_t src = 0, dst = 0; src < N; src++, dst += 2)
    zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);

  // ...and copy back again
  for (size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
}
2348
// convenience overload: deinterleave with the final block size set to the
// number of elements of the vector type
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2_b(Vec<T, SIMD_WIDTH> v[2 * N])
{
  swizzle2_b<N, Vec<T, SIMD_WIDTH>::elements>(v);
}
2354
2355// ===========================================================================
2356// transpose_f
2357// ===========================================================================
2358
2359// contributed by Adam Marschall
2360
2361template <typename T, size_t SIMD_WIDTH>
2362static SIMD_INLINE void transpose_f(
2363 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2364 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
2365{
2366 const auto elems = Vec<T, SIMD_WIDTH>::elements;
2367 for (size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2368 swizzle2_b<elems / 2, elems / 2>(outRows);
2369}
2370
2371// ===========================================================================
2372// Swizzle2 meta template
2373// ===========================================================================
2374
2375// contributed by Adam Marschall
2376
2377// -------------------- different zip functions ------------------------------
2378
// functor wrapping the full-width zip function; passed as
// template-template argument to the Swizzle2 meta template
template <size_t NUM_ELEMS, typename T, size_t SIMD_WIDTH>
struct Zip
{
  static SIMD_INLINE void _zip(Vec<T, SIMD_WIDTH> a, Vec<T, SIMD_WIDTH> b,
                               Vec<T, SIMD_WIDTH> &l, Vec<T, SIMD_WIDTH> &h)
  {
    zip<NUM_ELEMS, T>(a, b, l, h);
  }
};
2388
// functor wrapping the zip16 variant (presumably lane-wise in 16-byte
// lanes — see the Swizzle2Postprocess16 classes below); same interface
// as the Zip functor above
template <size_t NUM_ELEMS, typename T, size_t SIMD_WIDTH>
struct Zip16
{
  static SIMD_INLINE void _zip(Vec<T, SIMD_WIDTH> a, Vec<T, SIMD_WIDTH> b,
                               Vec<T, SIMD_WIDTH> &l, Vec<T, SIMD_WIDTH> &h)
  {
    zip16<NUM_ELEMS, T>(a, b, l, h);
  }
};
2398
2399// ------------------------ swizzle matrix once ------------------------------
2400
2401// primary template
// compile-time loop over one zip pass: zips source pair v[SRC]/v[SRC+N]
// into the adjacent destination rows v2[DST]/v2[DST+1], then recurses
// with SRC+1/DST+2 until SRC reaches N
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t SRC, size_t DST>
struct Swizzle2Once
{
  static constexpr auto SRC2 = SRC + N;
  static constexpr auto DST2 = DST + 1;

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    // printf("%s\n", "SwizzleOnce");
    // printf("  SRC=%d, SRC2=%d, DST=%d, DS2T=%d\n", SRC, SRC2, DST, DST2);
    Zip<1, T, SIMD_WIDTH>::_zip(v[SRC], v[SRC2], v2[DST], v2[DST2]);
    Swizzle2Once<Zip, T, SIMD_WIDTH, N, SRC + 1, DST + 2>::_swizzle(v, v2);
  }
};
2418
2419// partial specialization to end the iteration
// partial specialization to end the iteration (SRC == N): no-op
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t DST>
struct Swizzle2Once<Zip, T, SIMD_WIDTH, N, N, DST>
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH>[2 * N] /*v*/,
                                   Vec<T, SIMD_WIDTH>[2 * N] /*v2*/)
  {
    // for (size_t i = 0; i < 2 * N; i++) {
    //   print("%4d", v2[i]);
    //   puts("");
    // }
  }
};
2433
2434// ------------------------ swizzle matrix multiple times --------------------
2435
2436// primary template
// compile-time repetition of zip passes: two passes per step,
// ping-ponging v -> v2 -> v so the data ends up back in v; recurses
// until REP reaches FINAL_REPS
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t REP, size_t FINAL_REPS,
          size_t ODD>
struct Swizzle2Multiple
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    // printf("%s\n", "SwizzleMultiple");
    // printf("  REP=%d, FINAL_REPS=%d\n", REP, FINAL_REPS);
    Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v, v2);
    Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v2, v);
    Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, REP + 1, FINAL_REPS, ODD>::_swizzle(
      v, v2);
  }
};
2453
2454// partial specialization to end the iteration without swizzle post-amble
// partial specialization to end the iteration without swizzle post-amble
// (matches any ODD value; the ODD == 1 specialization below is more
// specialized and takes precedence for odd pass counts)
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t FINAL_REPS, size_t ODD>
struct Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, ODD>
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH>[2 * N],
                                   Vec<T, SIMD_WIDTH>[2 * N])
  {}
};
2463
2464// partial specialization to end the iteration with swizzle post-amble
// partial specialization to end the iteration with swizzle post-amble:
// for an odd total pass count, one final zip pass into v2 followed by a
// copy back into v
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t FINAL_REPS>
struct Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, 1>
{
  static constexpr auto ROW_STOP = 2 * N;

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    // printf("%s\n", "SwizzlePostamble");
    Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v, v2);
    CopyMatrix<T, SIMD_WIDTH, 0, ROW_STOP>::_copy(v2, v);
  }
};
2479
2480// ------------------------ swizzle main meta template -----------------------
2481
2482// generalized from Marat Dukhan's solution referred to at
2483// https://stackoverflow.com/a/15377386/3852630
2484
2485// primary template
// main meta template: compile-time version of swizzle2_b; computes the
// required pass counts from FINALBLKSIZE and delegates to
// Swizzle2Multiple (generalized from Marat Dukhan's solution, see link
// above)
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N, size_t FINALBLKSIZE>
struct Swizzle2
{
  static constexpr auto ORIG_REPS = floorlog2(FINALBLKSIZE) + 1;
  // two zip passes are fused per repetition, hence half the repetitions
  static constexpr auto FINAL_REPS = ORIG_REPS / 2;
  // odd pass count requires the post-amble specialization
  static constexpr auto ODD = (ORIG_REPS & 0x01);

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N])
  {
    Vec<T, SIMD_WIDTH> v2[2 * N];
    // printf("%s\n", "Swizzle");
    // printf("  N=%d, FINALBLKSIZE=%d\n", N, FINALBLKSIZE);
    // printf("  ORIG_REPS=%d, FINAL_REPS=%d, ODD=%d\n", ORIG_REPS, FINAL_REPS,
    //        ODD);
    Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, 0, FINAL_REPS, ODD>::_swizzle(v,
                                                                          v2);
  }
};
2505
2506// ===========================================================================
2507// swizzle2_c wrapper function
2508// ===========================================================================
2509
2510// 15. Oct 22 (Jonas Keller): added swizzle2_c wrapper function
2511
// swizzle2_c: thin wrapper around the Swizzle2 meta template with the
// full-width Zip functor and the vector element count as final block size
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2_c(Vec<T, SIMD_WIDTH> v[2 * N])
{
  Swizzle2<Zip, T, SIMD_WIDTH, N, Vec<T, SIMD_WIDTH>::elements>::_swizzle(v);
}
2517
2518// ===========================================================================
2519// Unswizzle
2520// ===========================================================================
2521
2522// 15. Oct 22 (Jonas Keller): added Unswizzle classes
2523
2524// Note: Unlike the Swizzle2 classes, the Unswizzle classes do not have a
2525// template-template parameter for the Zip class.
2526// In the Swizzle2 classes, the Zip template parameter is used to choose
2527// between the zip and zip16 functions, which is needed by the Transpose_g
2528// classes. The Unswizzle classes are not used by Transpose_g, so the Zip
2529// template parameter is not needed.
2530
// compile-time loop over one unzip (deinterleave) pass: splits the
// adjacent source pair v[SRC]/v[SRC+1] into the separated destination
// rows v2[DST]/v2[DST+N], then recurses with SRC+2/DST+1
template <typename T, size_t SIMD_WIDTH, size_t N, size_t SRC, size_t DST>
struct UnswizzleOnce
{
  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                     Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    unzip<1, T>(v[SRC], v[SRC + 1], v2[DST], v2[DST + N]);
    UnswizzleOnce<T, SIMD_WIDTH, N, SRC + 2, DST + 1>::_unswizzle(v, v2);
  }
};
2541
2542// partial specialization to end the iteration
// partial specialization to end the iteration (DST == N): no-op
template <typename T, size_t SIMD_WIDTH, size_t N, size_t SRC>
struct UnswizzleOnce<T, SIMD_WIDTH, N, SRC, N>
{
  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH>[2 * N],
                                     Vec<T, SIMD_WIDTH>[2 * N])
  {}
};
2550
// compile-time repetition of unzip passes: two passes per step,
// ping-ponging v -> v2 -> v (mirrors Swizzle2Multiple above)
template <typename T, size_t SIMD_WIDTH, size_t N, size_t REP,
          size_t FINAL_REPS, size_t ODD>
struct UnswizzleMultiple
{
  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                     Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v, v2);
    UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v2, v);
    UnswizzleMultiple<T, SIMD_WIDTH, N, REP + 1, FINAL_REPS, ODD>::_unswizzle(
      v, v2);
  }
};
2564
2565// partial specialization to end the iteration without unswizzle post-amble
// partial specialization to end the iteration without unswizzle
// post-amble (even total pass count; the ODD == 1 specialization below
// is more specialized)
template <typename T, size_t SIMD_WIDTH, size_t N, size_t FINAL_REPS,
          size_t ODD>
struct UnswizzleMultiple<T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, ODD>
{
  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH>[2 * N],
                                     Vec<T, SIMD_WIDTH>[2 * N])
  {}
};
2574
2575// partial specialization to end the iteration with unswizzle post-amble
// partial specialization to end the iteration with unswizzle post-amble:
// one final unzip pass into v2 plus copy back into v
template <typename T, size_t SIMD_WIDTH, size_t N, size_t FINAL_REPS>
struct UnswizzleMultiple<T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, 1>
{
  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                     Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v, v2);
    CopyMatrix<T, SIMD_WIDTH, 0, 2 * N>::_copy(v2, v);
  }
};
2586
2587// ------------------------ unswizzle main meta template ---------------------
2588
// main meta template: compile-time interleaving unswizzle; pass counts
// are derived from the vector element count (same scheme as Swizzle2)
template <typename T, size_t SIMD_WIDTH, size_t N>
struct Unswizzle
{
  static constexpr auto FINALBLKSIZE = Vec<T, SIMD_WIDTH>::elements;
  static constexpr auto ORIG_REPS = floorlog2(FINALBLKSIZE) + 1;
  static constexpr auto FINAL_REPS = ORIG_REPS / 2;
  static constexpr auto ODD = (ORIG_REPS & 0x01);

  static SIMD_INLINE void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N])
  {
    Vec<T, SIMD_WIDTH> v2[2 * N];
    UnswizzleMultiple<T, SIMD_WIDTH, N, 0, FINAL_REPS, ODD>::_unswizzle(v, v2);
  }
};
2603
2604// ===========================================================================
2605// unswizzle_b wrapper function
2606// ===========================================================================
2607
2608// 15. Oct 22 (Jonas Keller): added unswizzle_b wrapper function
2609
// unswizzle_b: thin wrapper around the Unswizzle meta template
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void unswizzle_b(Vec<T, SIMD_WIDTH> v[2 * N])
{
  Unswizzle<T, SIMD_WIDTH, N>::_unswizzle(v);
}
2615
2616// ===========================================================================
2617// transpose_g Swizzle2<Zip>
2618// ===========================================================================
2619
2620// contributed by Adam Marschall
2621
2622template <typename T, size_t SIMD_WIDTH>
2623static SIMD_INLINE void transpose_g(
2624 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2625 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
2626{
2627 const auto elems = Vec<T, SIMD_WIDTH>::elements;
2628 for (size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2629 Swizzle2<Zip, T, SIMD_WIDTH, elems / 2, elems / 2>::_swizzle(outRows);
2630}
2631
2632// ===========================================================================
2633// transpose_h: Swizzle2<Zip16> + Swizzle post-process
2634// ===========================================================================
2635
2636// contributed by Adam Marschall
2637
2638// ------------------------ swizzle post-process 16 once ---------------------
2639
2640// primary template
// compile-time loop over one lane-wise post-processing zip pass:
// zips the adjacent source pair v[SRC]/v[SRC+1] into the separated
// destination rows v2[DST]/v2[DST+N] using the 16-byte-lane zip16
template <typename T, size_t SIMD_WIDTH, size_t N, size_t SRC, size_t DST,
          size_t LANE_ELEMS>
struct Swizzle2Postprocess16Once
{
  static constexpr auto SRC2 = SRC + 1;
  static constexpr auto DST2 = DST + N;

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    // printf("%s\n", "SwizzlePostprocess16Once");
    // printf("  SRC=%d, SRC2=%d, DST=%d, DS2T=%d\n", SRC, SRC2, DST, DST2);
    Zip16<LANE_ELEMS, T, SIMD_WIDTH>::_zip(v[SRC], v[SRC2], v2[DST], v2[DST2]);
    Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, SRC + 2, DST + 1,
                              LANE_ELEMS>::_swizzle(v, v2);
  }
};
2658
2659// partial specialization to end the iteration
// partial specialization to end the iteration (DST == N): no-op
template <typename T, size_t SIMD_WIDTH, size_t N, size_t SRC,
          size_t LANE_ELEMS>
struct Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, SRC, N, LANE_ELEMS>
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH>[2 * N] /*v*/,
                                   Vec<T, SIMD_WIDTH>[2 * N] /*v2*/)
  {
    // for (size_t i = 0; i < 2 * N; i++) {
    //   print("%4d", v2[i]);
    //   puts("");
    // }
  }
};
2673
2674// ------------------------ swizzle post-process 16 --------------------------
2675
2676// primary template
// compile-time repetition of the lane-wise post-processing passes:
// two passes per step (ping-ponging v -> v2 -> v), with LANE_ELEMS
// doubled for each pass (quadrupled per recursion step)
template <typename T, size_t SIMD_WIDTH, size_t N, size_t LANE_ELEMS,
          size_t REP, size_t FINAL_REPS, size_t ODD>
struct Swizzle2Postprocess16
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    // printf("%s\n", "SwizzlePostprocess16");
    // printf("  REP=%d, FINAL_REPS=%d, LANE_ELEMS=%d\n", REP, FINAL_REPS,
    //        LANE_ELEMS);
    Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS>::_swizzle(v,
                                                                            v2);
    Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS * 2>::_swizzle(
      v2, v);
    Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS * 4, REP + 1, FINAL_REPS,
                          ODD>::_swizzle(v, v2);
  }
};
2695
2696// partial specialization to end the iteration without post-process post-amble
// partial specialization to end the iteration without post-process
// post-amble (even total pass count)
template <typename T, size_t SIMD_WIDTH, size_t N, size_t LANE_ELEMS,
          size_t FINAL_REPS, size_t ODD>
struct Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, FINAL_REPS,
                             FINAL_REPS, ODD>
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH>[2 * N],
                                   Vec<T, SIMD_WIDTH>[2 * N])
  {}
};
2706
2707// partial specialization to end the iteration with post-process post-amble
// partial specialization to end the iteration with post-process
// post-amble: one final pass into v2 plus copy back into v
template <typename T, size_t SIMD_WIDTH, size_t N, size_t LANE_ELEMS,
          size_t FINAL_REPS>
struct Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, FINAL_REPS,
                             FINAL_REPS, 1>
{
  static constexpr auto ROW_STOP = 2 * N;

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
                                   Vec<T, SIMD_WIDTH> v2[2 * N])
  {
    Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS>::_swizzle(v,
                                                                            v2);
    CopyMatrix<T, SIMD_WIDTH, 0, ROW_STOP>::_copy(v2, v);
  }
};
2723
2724// ------------------------ swizzle post-process hub -------------------------
2725
2726// primary template
// no post-processing needed for the generic Zip case, so this is a no-op;
// only the Zip16 specialization below does actual work
template <template <size_t, typename, size_t> class Zip, typename T,
          size_t SIMD_WIDTH, size_t N>
struct Swizzle2Postprocess
{
  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH>[2 * N]) {}
};
2733
2734// partial specialization to post-process Swizzle<Zip16>
// partial specialization to post-process Swizzle<Zip16>: merges the
// 16-byte lanes after the lane-wise swizzle; pass count depends on the
// number of 16-byte lanes in a vector (SIMD_WIDTH / 16)
template <typename T, size_t SIMD_WIDTH, size_t N>
struct Swizzle2Postprocess<Zip16, T, SIMD_WIDTH, N>
{
  // floorlog2(SIMD_WIDTH) - 4 == floorlog2(SIMD_WIDTH / 16)
  static constexpr auto ORIG_REPS = floorlog2(SIMD_WIDTH) - 4;
  static constexpr auto FINAL_REPS = ORIG_REPS / 2;
  static constexpr auto ODD = (ORIG_REPS & 0x01);
  // number of elements per 16-byte lane
  static constexpr auto LANE_ELEMS = 16 / sizeof(T);

  static SIMD_INLINE void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N])
  {
    Vec<T, SIMD_WIDTH> v2[2 * N];
    Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, 0, FINAL_REPS,
                          ODD>::_swizzle(v, v2);
  }
};
2750
2751// ------------------------ transpose_h function call -------------------------
2752
2753// contributed by Adam Marschall
2754
2755template <typename T, size_t SIMD_WIDTH>
2756static SIMD_INLINE void transpose_h(
2757 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2758 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
2759{
2760 const auto elems = Vec<T, SIMD_WIDTH>::elements;
2761 for (size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2762 Swizzle2<Zip16, T, SIMD_WIDTH, elems / 2, elems / 2>::_swizzle(outRows);
2763 Swizzle2Postprocess<Zip16, T, SIMD_WIDTH, elems / 2>::_swizzle(outRows);
2764}
2765
2766// ===========================================================================
2767// transpose_i: register-count based transpose
2768// ===========================================================================
2769
2770// contributed by Adam Marschall
2771
2772// primary template: unpack repetition
// compile-time loop over all rows of one unpack repetition: computes
// each output row from a source pair (SRC1/SRC2); UNPACK_PART selects
// the low/high unpack half based on the bit of PROCESS_ROW that
// corresponds to the current repetition, and SUB tracks the source-index
// offset that grows whenever the high half starts a new SUB_BASE block
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t PROCESS_ROW, size_t PROCESS_ROWS,
          size_t UNPACK_ELEMS, size_t UNPACK_REP, size_t UNPACK_REPS,
          size_t SUB_BASE, size_t SUB>
struct TransposeRcUnpackSingle
{
  static constexpr auto UNPACK_PART =
    (PROCESS_ROW >> (UNPACK_REPS - UNPACK_REP - 1)) & 0x01;
  static constexpr auto UNPACK_PART_NEXT =
    ((PROCESS_ROW + 1) >> (UNPACK_REPS - UNPACK_REP - 1)) & 0x01;
  static constexpr auto SRC1 = (PROCESS_ROW - SUB) * 2;
  static constexpr auto SRC2 = (PROCESS_ROW - SUB) * 2 + 1;

  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    // printf("%2d <- Unpack<%d, %d, %s, %d>(%2d, %2d) SUB_BASE: %d, SUB: %d\n",
    //        PROCESS_ROW, UNPACK_PART, UNPACK_ELEMS,
    //        TypeInfo<T>::name(), SIMD_WIDTH, SRC1, SRC2, SUB_BASE, SUB);
    outRows[PROCESS_ROW] =
      Unpack<UNPACK_PART, UNPACK_ELEMS, T, SIMD_WIDTH>::_unpack(inRows[SRC1],
                                                                inRows[SRC2]);
    TransposeRcUnpackSingle<
      Unpack, T, SIMD_WIDTH, PROCESS_ROW + 1, PROCESS_ROWS, UNPACK_ELEMS,
      UNPACK_REP, UNPACK_REPS, SUB_BASE,
      SUB + (UNPACK_PART_NEXT == 1 && (PROCESS_ROW + 1) % SUB_BASE == 0 ?
               SUB_BASE :
               0)>::_transpose(inRows, outRows);
  }
};
2804
2805// partial specialisation to end iteration PROCESS_REP
// partial specialisation to end iteration (PROCESS_ROW == PROCESS_ROWS):
// no-op
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t PROCESS_ROWS, size_t UNPACK_ELEMS,
          size_t UNPACK_REP, size_t UNPACK_REPS, size_t SUB_BASE, size_t SUB>
struct TransposeRcUnpackSingle<Unpack, T, SIMD_WIDTH, PROCESS_ROWS,
                               PROCESS_ROWS, UNPACK_ELEMS, UNPACK_REP,
                               UNPACK_REPS, SUB_BASE, SUB>
{
  static SIMD_INLINE void _transpose(
    const Vec<T, SIMD_WIDTH>[PROCESS_ROWS] /*inRows*/,
    Vec<T, SIMD_WIDTH>[PROCESS_ROWS] /*outRows*/)
  {
    // printf("%2d\n", PROCESS_ROWS);
    // for (size_t i = 0; i < PROCESS_ROWS; i++) {
    //   print("%5d", outRows[i]);
    //   puts("");
    // }
    // puts("");
  }
};
2825
2826// ---------------------------------------------------------------------------
2827
2828// primary template: unpack repetition
// compile-time repetition of unpack passes: each pass doubles
// UNPACK_ELEMS and halves SUB_BASE; inRows/outRows are swapped on each
// recursion step (ping-pong buffering)
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t UNPACK_REP, size_t UNPACK_REPS,
          size_t PROCESS_ROWS, size_t UNPACK_ELEMS, size_t SUB_BASE,
          size_t UNPACK_ODD>
struct TransposeRcUnpackMultiple
{
  static SIMD_INLINE void _transpose(Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
                                     Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    // printf("\nTransposeRcUnpackMultiple %2d %s %d/%d\n",
    //        SIMD_WIDTH, TypeInfo<T>::name(), UNPACK_REP+1, UNPACK_REPS);
    TransposeRcUnpackSingle<Unpack, T, SIMD_WIDTH, 0, PROCESS_ROWS,
                            UNPACK_ELEMS, UNPACK_REP, UNPACK_REPS, SUB_BASE,
                            0>::_transpose(inRows, outRows);
    TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REP + 1,
                              UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS * 2,
                              SUB_BASE / 2, UNPACK_ODD>::_transpose(outRows,
                                                                    inRows);
  }
};
2849
2850// partial specialisation to end iteration UNPACK_REP
// partial specialisation to end iteration UNPACK_REP (odd number of
// passes: result already ended up in the outRows buffer, nothing to do)
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t UNPACK_REPS, size_t PROCESS_ROWS,
          size_t UNPACK_ELEMS, size_t SUB_BASE, size_t UNPACK_ODD>
struct TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REPS,
                                 UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS,
                                 SUB_BASE, UNPACK_ODD>
{
  static SIMD_INLINE void _transpose(Vec<T, SIMD_WIDTH>[PROCESS_ROWS],
                                     Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
  {}
};
2862
2863// partial specialisation to end iteration UNPACK_REP
// partial specialisation to end iteration UNPACK_REP (UNPACK_ODD == 0:
// the ping-pong left the result in the wrong buffer, so copy it over)
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t UNPACK_REPS, size_t PROCESS_ROWS,
          size_t UNPACK_ELEMS, size_t SUB_BASE>
struct TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REPS,
                                 UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS,
                                 SUB_BASE, 0>
{
  static SIMD_INLINE void _transpose(Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
                                     Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    // printf("\nTransposeRcUnpackMultiple %2d %s %d/%d Copy Matrix\n",
    //        SIMD_WIDTH, TypeInfo<T>::name(), UNPACK_REPS, UNPACK_REPS);
    CopyMatrix<T, SIMD_WIDTH, 0, PROCESS_ROWS>::_copy(inRows, outRows);
  }
};
2879
2880// ---------------------------------------------------------------------------
2881
2882// primary template: store all registers lane-wise
// compile-time loop over the lanes of one output register: stores lane
// LANE of outRows[VO] (unaligned) at STORE_OFF and advances the offset
// by one full output row block per lane
// (assumes an output-array pointer parameter named outArray — the
// corresponding parameter line is not visible in this view)
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_REPS, size_t PROCESS_ROWS, size_t UNPACK_REPS,
          size_t STORE_OFF, size_t VO, size_t LANE>
struct TransposeRcStoreLane
{
  static constexpr auto VEC_ELEMS_OUT = Vec<T, SIMD_WIDTH>::elems;

  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    storeu(outArray + STORE_OFF, extractLane<LANE>(outRows[VO]));
    TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
                         UNPACK_REPS, STORE_OFF + PROCESS_ROWS * VEC_ELEMS_OUT,
                         VO, LANE + 1>::_store(outArray, outRows);
  }
};
2900
2901// partial specialisation to end iteration LANE=PROCESS_REPS
// partial specialisation to end iteration LANE=PROCESS_REPS: no-op
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_REPS, size_t PROCESS_ROWS, size_t UNPACK_REPS,
          size_t STORE_OFF, size_t VO>
struct TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
                            PROCESS_ROWS, UNPACK_REPS, STORE_OFF, VO,
                            PROCESS_REPS>
{
  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
  {}
};
2914
2915// ---------------------------------------------------------------------------
2916
2917// primary template: store all registers lane-wise
// compile-time loop over all output registers (VO): stores each register
// lane-wise via TransposeRcStoreLane, advancing the base offset by one
// output vector per register
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_REPS, size_t PROCESS_ROWS, size_t UNPACK_REPS,
          size_t STORE_OFF, size_t VO>
struct TransposeRcStoreLanes
{
  static constexpr auto VEC_ELEMS_OUT = Vec<T, SIMD_WIDTH>::elems;

  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
                         UNPACK_REPS, STORE_OFF, VO, 0>::_store(outArray,
                                                                outRows);
    TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
                          PROCESS_ROWS, UNPACK_REPS, STORE_OFF + VEC_ELEMS_OUT,
                          VO + 1>::_store(outArray, outRows);
  }
};
2937
2938// partial specialisation to end iteration VO=PROCESS_ROWS
// partial specialisation to end iteration VO=PROCESS_ROWS: no-op
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_REPS, size_t PROCESS_ROWS, size_t UNPACK_REPS,
          size_t STORE_OFF>
struct TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
                             PROCESS_ROWS, UNPACK_REPS, STORE_OFF, PROCESS_ROWS>
{
  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
  {}
};
2950
2951// ---------------------------------------------------------------------------
2952
2953// primary template: store hub
2954// decides whether to store directly (Store16) or to store lane-wise
// store hub: general case (PROCESS_REPS > 1) stores lane-wise with an
// offset derived from the current repetition; the PROCESS_REPS == 1
// specialisation below stores directly via Store16
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_REPS, size_t PROCESS_ROWS, size_t UNPACK_REPS>
struct TransposeRcStore
{
  // number of elements in one 16-byte lane
  static constexpr auto ELEMS_PER_LANE = 16 / sizeof(T);
  static constexpr auto STORE_OFF = PROCESS_REP * ELEMS_PER_LANE;

  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
                          PROCESS_ROWS, UNPACK_REPS, STORE_OFF,
                          0>::_store(outArray, outRows);
  }
};
2971
// partial specialisation for PROCESS_REPS == 1: the whole matrix fits
// into one repetition, store directly via Store16
template <typename T, size_t SIMD_WIDTH, size_t PROCESS_REP,
          size_t PROCESS_ROWS, size_t UNPACK_REPS>
struct TransposeRcStore<T, SIMD_WIDTH, PROCESS_REP, 1, PROCESS_ROWS,
                        UNPACK_REPS>
{
  static SIMD_INLINE void _store(
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
  {
    // printf("\nStore16 PROCESS_ROWS=%d\n", PROCESS_ROWS);
    Store16<Store, T, SIMD_WIDTH, PROCESS_ROWS, 0, 16 / sizeof(T), SIMD_WIDTH,
            0, 0>::_store16(outArray, outRows);
  }
};
2986
2987// ---------------------------------------------------------------------------
2988
2989// primary template: main repetition
2990// loads, transposes, stores chunk of matrix
// main repetition: loads one chunk of PROCESS_ROWS rows from the input
// array, transposes it through the unpack passes and stores the result,
// then recurses for the next chunk
// (assumes inArray/outArray pointer parameters — the parameter lines of
// _transpose are not visible in this view)
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t PROCESS_REP, size_t PROCESS_REPS,
          size_t PROCESS_ROWS, size_t UNPACK_REPS, size_t UNPACK_ODD>
struct TransposeRcRep
{
  // element offset of this chunk in the input array
  static constexpr auto LOAD_OFF =
    PROCESS_REP * PROCESS_ROWS * SIMD_WIDTH / sizeof(T);
  static constexpr auto SUB_BASE = 1 << (floorlog2(PROCESS_ROWS) - 1);

  static SIMD_INLINE void _transpose(
  {
    // printf("\nTransposeRcRep %2d %s %d/%d\n",
    //        SIMD_WIDTH, TypeInfo<T>::name(), PROCESS_REP+1,
    //        PROCESS_REPS);
    Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS];
    Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS];
    load(inArray + LOAD_OFF, inRows, PROCESS_ROWS);
    // for (size_t i = 0; i < PROCESS_ROWS; i++) {
    //   print("%5d", inRows[i]);
    //   puts("");
    // }
    // puts("");
    TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, 0, UNPACK_REPS,
                              PROCESS_ROWS, 1, SUB_BASE,
                              UNPACK_ODD>::_transpose(inRows, outRows);
    // for (size_t i = 0; i < PROCESS_ROWS; i++) {
    //   print("%5d", outRows[i]);
    //   puts("");
    // }
    // puts("");
    TransposeRcStore<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
                     UNPACK_REPS>::_store(outArray, outRows);
    TransposeRcRep<Unpack, T, SIMD_WIDTH, PROCESS_REP + 1, PROCESS_REPS,
                   PROCESS_ROWS, UNPACK_REPS, UNPACK_ODD>::_transpose(inArray,
                                                                      outArray);
  }
};
3030
3031// partial specialisation to end iteration PROCESS_REP
// partial specialisation to end iteration PROCESS_REP: no-op
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH, size_t PROCESS_REPS, size_t PROCESS_ROWS,
          size_t UNPACK_REPS, size_t UNPACK_ODD>
struct TransposeRcRep<Unpack, T, SIMD_WIDTH, PROCESS_REPS, PROCESS_REPS,
                      PROCESS_ROWS, UNPACK_REPS, UNPACK_ODD>
{
  static SIMD_INLINE void _transpose(
  {}
};
3043
3044// ---------------------------------------------------------------------------
3045
3046// primary template: main entrance
// main entrance of the register-count based transpose: chooses how many
// rows can be processed per repetition from the number of available
// SIMD registers and delegates to TransposeRcRep
template <template <size_t, size_t, typename, size_t> class Unpack, typename T,
          size_t SIMD_WIDTH>
struct TransposeRc
{
  // use only half of the native registers (the other half is needed for
  // the ping-pong output buffer)
  static constexpr auto SIMD_REGS = NATIVE_SIMD_REG_COUNT / 2;
  static constexpr auto NUM_ROWS = SIMD_WIDTH / sizeof(T);
  // split into multiple repetitions if the matrix rows don't fit into
  // the available registers
  static constexpr auto PROCESS_REPS =
    NUM_ROWS > SIMD_REGS ? SIMD_WIDTH / 16 : 1;
  static constexpr auto PROCESS_ROWS = NUM_ROWS / PROCESS_REPS;
  static constexpr auto UNPACK_REPS =
    PROCESS_REPS == 1 ? floorlog2(PROCESS_ROWS) : floorlog2(16 / sizeof(T));
  static constexpr auto UNPACK_ODD = (UNPACK_REPS & 0x01);

  static SIMD_INLINE void _transpose(
  {
    // printf("TransposeRc Process Rows: %d \n", PROCESS_ROWS);
    TransposeRcRep<Unpack, T, SIMD_WIDTH, 0, PROCESS_REPS, PROCESS_ROWS,
                   UNPACK_REPS, UNPACK_ODD>::_transpose(inArray, outArray);
    // printf("%s","\n");
  }
};
3070
3071// ---------------------------------------------------------------------------
3072
3073// contributed by Adam Marschall
3074
3075// function template: full transpose
3076template <typename T, size_t SIMD_WIDTH>
3077static SIMD_INLINE void transpose_i(
3078 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
3079 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
3080{
3081 // 20. Sep 22 (Jonas Keller):
3082 // use simd_aligned_malloc for inArray and outArray
3083 // and free them at the end of the function
3084 // 30. Jul 23 (Jonas Keller):
3085 // put inArray and outArray on the stack instead of heap to avoid allocation
3086 // and possibly allow for better compiler optimisation
3087 const auto N = Vec<T, SIMD_WIDTH>::elements;
3088 T inArray[N * N] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
3089 T outArray[N * N] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
3090 store(inArray, inRows, N);
3091 TransposeRc<Unpack16, T, SIMD_WIDTH>::_transpose(inArray, outArray);
3092 load(outArray, outRows, N);
3093}
3094
3095// ===========================================================================
3096// unswizzle_a (interleave)
3097// ===========================================================================
3098
// unswizzle_a: interleave by repeated unzip passes; each pass doubles
// the block size until the full vector element count is reached
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void unswizzle_a(Vec<T, SIMD_WIDTH> v[2 * N])
{
  const auto finalBlkSize = Vec<T, SIMD_WIDTH>::elements;
  Vec<T, SIMD_WIDTH> v2[2 * N];
  for (size_t blkSize = 1; blkSize <= finalBlkSize; blkSize *= 2) {
    // unzip (deinterleave adjacent pairs into separated halves)
    for (size_t dst = 0, src = 0; dst < N; dst++, src += 2)
      unzip<1>(v[src], v[src + 1], v2[dst], v2[dst + N]);
    // copy result back to v
    // TODO: unswizzle_a: check code produced by compiler for copying
    for (size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
  }
}
3113
3114} // namespace ext
3115} // namespace internal
3116
// public hub function: delegates to the fastest swizzle2 implementation
// (currently swizzle2_c, the compile-time meta-template version)
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void swizzle2(Vec<T, SIMD_WIDTH> v[2 * N])
{
  // uncomment fastest version
  // internal::ext::swizzle2_a<N>(v);
  // internal::ext::swizzle2_b<N>(v);
  internal::ext::swizzle2_c<N>(v);
}
3167
// public hub function: delegates to the fastest unswizzle implementation
// (currently unswizzle_b, the compile-time meta-template version)
template <size_t N, typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void unswizzle(Vec<T, SIMD_WIDTH> v[2 * N])
{
  // uncomment fastest version
  // internal::ext::unswizzle_a<N>(v);
  internal::ext::unswizzle_b<N>(v);
}
3209
// public hub function: delegates to the fastest out-of-place transpose
// (currently the auto-generated transpose1inplcLane)
// (parameter lines of the signature are not visible in this view)
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose(
{
  // uncomment fastest version

  // 06. Sep 23 (Jonas Keller):
  // added transpose1inplc, transpose2inplc, transpose1inplcLane and
  // transpose2inplcLane and switched to transpose1inplcLane

  // internal::ext::transpose_a(inRows, outRows);
  // internal::ext::transpose_b(inRows, outRows);
  // internal::ext::transpose_c(inRows, outRows);
  // internal::ext::transpose_d(inRows, outRows);
  // internal::ext::transpose_e(inRows, outRows);
  // internal::ext::transpose_f(inRows, outRows);
  // internal::ext::transpose_g(inRows, outRows);
  // internal::ext::transpose_h(inRows, outRows);
  // internal::ext::transpose_i(inRows, outRows);
  // internal::ext::transpose1inplc(inRows, outRows);
  // internal::ext::transpose2inplc(inRows, outRows);
  internal::ext::transpose1inplcLane(inRows, outRows);
  // internal::ext::transpose2inplcLane(inRows, outRows);
}
3247
// public hub function: in-place transpose overload; only auto-generated
// in-place versions are allowed here
// (parameter line of the signature is not visible in this view)
template <typename T, size_t SIMD_WIDTH>
static SIMD_INLINE void transpose(
{
  // uncomment fastest version
  // only auto-generated inplace versions are allowed here
  // internal::ext::transpose1inplc(rows);
  // internal::ext::transpose2inplc(rows);
  internal::ext::transpose1inplcLane(rows);
  // internal::ext::transpose2inplcLane(rows);
}
3270
3271// ========================================================================
3272// integration of values in simd::Vec
3273// ========================================================================
3274
3275// example:
3276// 0 1 2 3 4 5 6 7
3277// + - 0 1 2 3 4 5 6 slle(1)
3278// --------------------------------------
3279// 0 0-1 1-2 2-3 3-4 4-5 5-6 6-7
3280// + - - 0 0-1 1-2 2-3 3-4 4-5 slle(2)
3281// --------------------------------------
3282// 0 0-1 0-2 0-3 1-4 2-5 3-6 4-7
3283// + - - - - 0 0-1 0-2 0-3 slle(4)
3284// --------------------------------------
3285// 0 0-1 0-2 0-3 0-4 0-5 0-6 0-7
3286//
3287// problem: slle has immediate argument, T-SIMD: template parameter
3288
3289namespace internal {
3290namespace ext {
3291// primary template
3292template <typename T, size_t SIMD_WIDTH, int SHIFT, int END_SHIFT>
3293struct HInt
3294{
3295public:
3296 static SIMD_INLINE simd::Vec<T, SIMD_WIDTH> integrate(
3297 const simd::Vec<T, SIMD_WIDTH> &v)
3298 {
3299 return HInt<T, SIMD_WIDTH, 2 * SHIFT, END_SHIFT>::integrate(
3301 }
3302};
3303
3304// termination template
3305template <typename T, size_t SIMD_WIDTH, int END_SHIFT>
3306struct HInt<T, SIMD_WIDTH, END_SHIFT, END_SHIFT>
3307{
3308public:
3309 static SIMD_INLINE simd::Vec<T, SIMD_WIDTH> integrate(
3310 const simd::Vec<T, SIMD_WIDTH> &v)
3311 {
3312 return v;
3313 }
3314};
3315} // namespace ext
3316} // namespace internal
3317
3329template <typename T, size_t SIMD_WIDTH>
3331 const simd::Vec<T, SIMD_WIDTH> &v)
3332{
3333 return internal::ext::HInt<T, SIMD_WIDTH, 1,
3335}
3336
3337// ===========================================================================
3338// setones: set all bits to 1
3339// ===========================================================================
3340
3347template <typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3348static SIMD_INLINE Vec<T, SIMD_WIDTH> setones()
3349{
3351 return cmpeq(zero, zero);
3352}
3353
3354// ===========================================================================
3355// setmin / setmax: set all elements min./max. value of type without set1()
3356// setunity: set all elements to +1
3357// setnegunity: set all elements to -1
3358// ===========================================================================
3359
3367template <typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3368static SIMD_INLINE Vec<T, SIMD_WIDTH> setmin()
3369{
3370 return set1<T, SIMD_WIDTH>(std::numeric_limits<T>::lowest());
3371}
3372
3380template <typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3381static SIMD_INLINE Vec<T, SIMD_WIDTH> setmax()
3382{
3383 return set1<T, SIMD_WIDTH>(std::numeric_limits<T>::max());
3384}
3385
3392template <typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3393static SIMD_INLINE Vec<T, SIMD_WIDTH> setunity()
3394{
3395 return set1<T, SIMD_WIDTH>(T(1));
3396}
3397
3406template <typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3408{
3409 static_assert(std::is_signed<T>::value || std::is_floating_point<T>::value,
3410 "setnegunity() only available for signed integer and floating "
3411 "point types");
3412 return set1<T, SIMD_WIDTH>(T(-1));
3413}
3414
3415// ===========================================================================
3416// bitonic sort
3417// ===========================================================================
3418
3424// code contributed by Lukas Schiermeier and Moritz Breipohl, modified
3425
3426namespace internal {
3427namespace ext {
3428// compare-and-swap
3429template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3430struct Cas;
3431
3432// specialization for DESCENDING
3433template <typename T, size_t SIMD_WIDTH>
3434struct Cas<SortSlope::DESCENDING, T, SIMD_WIDTH>
3435{
3436 static void compareAndSwap(Vec<T, SIMD_WIDTH> &a, Vec<T, SIMD_WIDTH> &b)
3437 {
3438 Vec<T, SIMD_WIDTH> temp = min(a, b);
3439 a = max(a, b);
3440 b = temp;
3441 }
3442};
3443
3444// specialization for ASCENDING
3445template <typename T, size_t SIMD_WIDTH>
3446struct Cas<SortSlope::ASCENDING, T, SIMD_WIDTH>
3447{
3448 static void compareAndSwap(Vec<T, SIMD_WIDTH> &a, Vec<T, SIMD_WIDTH> &b)
3449 {
3450 Vec<T, SIMD_WIDTH> temp = max(a, b);
3451 a = min(a, b);
3452 b = temp;
3453 }
3454};
3455
3456// in-place sorting of multiple arbitrary vectors;
3457// transVecs have to be transposed vectors (same number of elements
3458// as in Vec), are still transposed afterwards
3459template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3460static SIMD_INLINE void bitonicSortTransposed(
3461 Vec<T, SIMD_WIDTH> transVecs[Vec<T, SIMD_WIDTH>::elems])
3462{
3463 constexpr auto numVecs = Vec<T, SIMD_WIDTH>::elements;
3464 /* Dependent Loops */
3465 for (size_t blkSize = 2; blkSize <= numVecs; blkSize *= 2) {
3466 /*
3467 * Bitonic Core
3468 * Independent Loops
3469 */
3470 for (size_t blkStart = 0; blkStart < numVecs; blkStart += blkSize) {
3471 size_t halfBlk = blkSize / 2;
3472 size_t leftCounter = blkStart;
3473 size_t rightCounter = blkStart + (blkSize - 1);
3474 /* Independent Loops */
3475 for (size_t i = 0; i < halfBlk; i++) {
3476 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3477 transVecs[rightCounter]);
3478 leftCounter++;
3479 rightCounter--;
3480 }
3481 /*
3482 * This loop is skipped for blkSize < 4
3483 * Builds the second half of the bitonic core.
3484 *
3485 * Dependent Loops
3486 */
3487 for (size_t step = blkSize / 4; step > 0; step /= 2) {
3488 /* Independent Loops */
3489 for (size_t jump = 0; jump < blkSize; jump += step * 2) {
3490 leftCounter = blkStart + jump;
3491 rightCounter = blkStart + jump + step;
3492 /* Independent Loops */
3493 for (size_t k = 0; k < step; k++) {
3494 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3495 transVecs[rightCounter]);
3496 leftCounter++;
3497 rightCounter++;
3498 }
3499 }
3500 }
3501 }
3502 }
3503}
3504
3505// post-fusion stage of bitonic sort, used to sort pairs of sorted vectors
3506// which were fused (one reversed) and then sorted such that the pair
3507// is sorted over the two vectors
3508// in-place sorting; transVecs have to be transposed vectors (same
3509// number of elements as in Vec), are still transposed
3510// afterwards
3511template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3512static SIMD_INLINE void bitonicSortReducedTransposed(
3513 Vec<T, SIMD_WIDTH> transVecs[Vec<T, SIMD_WIDTH>::elems])
3514{
3515 constexpr auto numVecs = Vec<T, SIMD_WIDTH>::elements;
3516 for (size_t step = numVecs / 2; step > 0; step /= 2) {
3517 /* Independent Loops */
3518 for (size_t jump = 0; jump < numVecs; jump += step * 2) {
3519 size_t leftCounter = jump;
3520 size_t rightCounter = jump + step;
3521 /* Independent Loops */
3522 for (size_t k = 0; k < step; k++) {
3523 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3524 transVecs[rightCounter]);
3525 leftCounter++;
3526 rightCounter++;
3527 }
3528 }
3529 }
3530}
3531
3532// same as bitonicSortReducedTransposed, but including the transpose
3533template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3534static SIMD_INLINE void bitonicSortReduced(
3535 Vec<T, SIMD_WIDTH> vecs[Vec<T, SIMD_WIDTH>::elems])
3536{
3537 Vec<T, SIMD_WIDTH> transVecs[Vec<T, SIMD_WIDTH>::elements];
3538 transpose(vecs, transVecs);
3539 internal::ext::bitonicSortReducedTransposed<SLOPE>(transVecs);
3540 transpose(transVecs, vecs);
3541}
3542
3543} // namespace ext
3544} // namespace internal
3545
3553template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3554static SIMD_INLINE void bitonicSort(
3556{
3558 transpose(vecs, transVecs);
3559 internal::ext::bitonicSortTransposed<SLOPE>(transVecs);
3560 transpose(transVecs, vecs);
3561}
3562
3563namespace internal {
3564namespace ext {
3565// second vector is reversed and fused with first vector
3566// we don't have to reverse b after the compare-swap since it is
3567// bitonic
3568template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3569static SIMD_INLINE void bitonicFusion(Vec<T, SIMD_WIDTH> &a,
3570 Vec<T, SIMD_WIDTH> &b)
3571{
3572 b = reverse(b);
3573 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(a, b);
3574}
3575} // namespace ext
3576} // namespace internal
3577
3578// given sorted vectors as inputs, it fuses each consecutive pair
3579// such that it is completely sorted over the pair
3580
3589template <SortSlope SLOPE, typename T, size_t SIMD_WIDTH_DEFAULT_NATIVE>
3590static SIMD_INLINE void bitonicSortSortedPairs(
3592{
3593 // Vec<T, SIMD_WIDTH> transVecs[Vec<T, SIMD_WIDTH>::elements];
3594 // second vector of each pair is reversed and fused with first vector
3595 for (size_t i = 0; i < Vec<T, SIMD_WIDTH>::elements; i += 2)
3596 internal::ext::bitonicFusion<SLOPE>(vecs[i], vecs[i + 1]);
3597 // transpose(vecs, transVecs);
3598 // internal::ext::bitonicSortReducedTransposed<SLOPE>(transVecs);
3599 // transpose(transVecs, vecs);
3600 internal::ext::bitonicSortReduced<SLOPE>(vecs);
3601}
3602
3615// contributed by Ebba Stina Siebold, modified
3616
3617template <size_t LENGTH, SortSlope SLOPE, typename T,
3618 size_t SIMD_WIDTH_DEFAULT_NATIVE>
3619static SIMD_INLINE void verticalBitonicSort(T data[LENGTH])
3620{
3621 // number of elements in SIMD vector
3622 constexpr size_t SIMD_ELEMS = Vec<T, SIMD_WIDTH>::elements;
3623 // number of SIMD vectors in data
3624 // LENGTH is assumed to be 2^n * SIMD_ELEMS^2 (n=0,1,2...)
3625 // -> NUM_VECS is 2^n * SIMD_ELEMS
3626 constexpr size_t NUM_VECS = LENGTH / SIMD_ELEMS;
3627 // number of vectors handled simultaneously by vertical bitonic sort
3628 // -> SORTING_STAGES is 2^n
3629 constexpr size_t SORTING_STAGES = NUM_VECS / SIMD_ELEMS;
3630 // examples if condition is fulfilled:
3631 // LENGTH=16, SIMD_ELEMS=4, NUM_VECS=4, SORTING_STAGES=1
3632 // LENGTH=32, SIMD_ELEMS=4, NUM_VECS=8, SORTING_STAGES=2
3633 // LENGTH=64, SIMD_ELEMS=4, NUM_VECS=16, SORTING_STAGES=4
3634 // example if condition on LENGTH is violated:
3635 // LENGTH=10, SIMD_ELEMS=4, NUM_VECS=2, SORTING_STAGES=0
3636 // -> SORTING_STAGES * SIMD_ELEMS * SIMD_ELEMS = 0 != 10
3637 // LENGTH=100, SIMD_ELEMS=4, NUM_VECS=25, SORTING_STAGES=5
3638 // -> SORTING_STAGES * SIMD_ELEMS * SIMD_ELEMS = 80 != 100
3639 static_assert(SORTING_STAGES * SIMD_ELEMS * SIMD_ELEMS == LENGTH,
3640 "LENGTH is not 2^n * SIMD_ELEMS^2");
3641
3642 // load data into vectors
3643 // TODO: is it efficient to handle this via a large array of Vec?
3644 Vec<T, SIMD_WIDTH> vecs[NUM_VECS];
3645 loadu<T, SIMD_WIDTH>(data, vecs, NUM_VECS);
3646 // sort all vectors individually
3647 for (size_t i = 0; i < SORTING_STAGES; i++) {
3648 // sort next SIMD_ELEMS vectors
3649 Vec<T, SIMD_WIDTH> *current_vecs = vecs + i * SIMD_ELEMS;
3650 // sort each vector in itself
3651 bitonicSort<SLOPE>(current_vecs);
3652 }
3653 // loop structure taken from bitonicSortTransposed
3654 for (size_t bulk_size = 2; bulk_size <= NUM_VECS; bulk_size *= 2) {
3655 // flip each lower half of each bulk and compare and swap with
3656 // the upper half
3657 for (size_t bulk_start = 0; bulk_start < NUM_VECS;
3658 bulk_start += bulk_size) {
3659 size_t half_bulk = bulk_size / 2;
3660 size_t left_counter = bulk_start;
3661 size_t right_counter = bulk_start + (bulk_size - 1);
3662 for (size_t i = 0; i < half_bulk; i++) {
3663 internal::ext::bitonicFusion<SLOPE>(vecs[left_counter],
3664 vecs[right_counter]);
3665 left_counter++;
3666 right_counter--;
3667 }
3668 }
3669 // distribute elements to individual vectors
3670 for (size_t bulk_start = 0; bulk_start < NUM_VECS;
3671 bulk_start += bulk_size) {
3672 for (size_t step = bulk_size / 4; step > 0; step /= 2) {
3673 for (size_t jump = 0; jump < bulk_size; jump += step * 2) {
3674 size_t left_counter = bulk_start + jump;
3675 size_t right_counter = bulk_start + jump + step;
3676 for (size_t k = 0; k < step; k++) {
3677 internal::ext::Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(
3678 vecs[left_counter], vecs[right_counter]);
3679 left_counter++;
3680 right_counter++;
3681 }
3682 }
3683 }
3684 }
3685 // restore order in individual vectors
3686 for (size_t i = 0; i < SORTING_STAGES; i++) {
3687 // sort next SIMD_ELEMS vectors
3688 Vec<T, SIMD_WIDTH> *current_vecs = vecs + i * SIMD_ELEMS;
3689 // sort each vector in itself
3690 // TODO: in-place version (1 arg.) of transpose slower?
3691 // TODO: put this into a function bitonicSortReduced?
3692 // transpose(current_vecs);
3693 // internal::ext::bitonicSortReducedTransposed<SLOPE>(current_vecs);
3694 // transpose(current_vecs);
3695 internal::ext::bitonicSortReduced<SLOPE>(current_vecs);
3696 }
3697 }
3698 // store vectors into original data array
3699 storeu<T, SIMD_WIDTH>(data, vecs, NUM_VECS);
3700}
3701
3704// ===========================================================================
3705// operators
3706// ===========================================================================
3707
// compound-assignment operator from a binary vector function;
// returns the updated left operand (C++ Coding Standards p.49, item 27)
#define SIMDVEC_BINOPEQ(OP, FCT)                                         \
  template <typename T, size_t SIMD_WIDTH>                               \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(Vec<T, SIMD_WIDTH> &a,        \
                                           const Vec<T, SIMD_WIDTH> &b)  \
  {                                                                      \
    a = FCT(a, b);                                                       \
    return a;                                                            \
  }
3718
// binary operator forwarding to a binary vector function
#define SIMDVEC_BINOP(OP, FCT)                                           \
  template <typename T, size_t SIMD_WIDTH>                               \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(const Vec<T, SIMD_WIDTH> &a,  \
                                           const Vec<T, SIMD_WIDTH> &b)  \
  {                                                                      \
    return FCT(a, b);                                                    \
  }
3726
// unary operator forwarding to a unary vector function
#define SIMDVEC_UNOP(OP, FCT)                                            \
  template <typename T, size_t SIMD_WIDTH>                               \
  static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(const Vec<T, SIMD_WIDTH> &a)  \
  {                                                                      \
    return FCT(a);                                                       \
  }
3733
3734// limitations:
3735// - mul, div only for Float
3736// - neg only for signed types
3737
3744SIMDVEC_BINOP(operator+, adds)
3746SIMDVEC_BINOP(operator-, subs)
3748SIMDVEC_BINOP(operator*, mul)
3750SIMDVEC_BINOP(operator/, div)
3752SIMDVEC_BINOP(operator&, bit_and)
3754SIMDVEC_BINOP(operator|, bit_or)
3756SIMDVEC_BINOP(operator^, bit_xor)
3757
3759SIMDVEC_BINOPEQ(operator+=, adds)
3761SIMDVEC_BINOPEQ(operator-=, subs)
3763SIMDVEC_BINOPEQ(operator*=, mul)
3765SIMDVEC_BINOPEQ(operator/=, div)
3767SIMDVEC_BINOPEQ(operator&=, bit_and)
3769SIMDVEC_BINOPEQ(operator|=, bit_or)
3771SIMDVEC_BINOPEQ(operator^=, bit_xor)
3772
3774SIMDVEC_BINOP(operator>, cmpgt)
3776SIMDVEC_BINOP(operator>=, cmpge)
3778SIMDVEC_BINOP(operator==, cmpeq)
3780SIMDVEC_BINOP(operator!=, cmpneq)
3782SIMDVEC_BINOP(operator<=, cmple)
3784SIMDVEC_BINOP(operator<, cmplt)
3785
3787SIMDVEC_UNOP(operator-, neg)
3789SIMDVEC_UNOP(operator~, bit_not)
3790
3792} // namespace simd
3793
3794#endif
Iterative horizontal accumulator with store of the result. Calculates the horizontal accumulation of ...
Definition ext.H:1395
void push(Vec< T, SIMD_WIDTH > v)
Pushes the next Vec to be horizontally accumulated. Stores the result of the horizontal accumulation ...
Definition ext.H:1419
void finish()
Finishes the horizontal accumulation and stores the result of the horizontal accumulation into memory...
Definition ext.H:1434
HAccStore(T *const p)
Constructs a new HAccStore object.
Definition ext.H:1408
SIMD vector class, holds multiple elements of the same type.
Definition vec.H:75
static constexpr size_t elems
Number of elements in the vector. Alias for elements.
Definition vec.H:85
static constexpr size_t elements
Number of elements in the vector.
Definition vec.H:80
static Vec< T, SIMD_WIDTH > sign(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Negates the elements of a Vec of floating-point numbers where the corresponding element of a second V...
Definition ext.H:1800
static Vec< T, SIMD_WIDTH > avgrd(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vecs, rounding down.
Definition ext.H:1611
static Vec< T, SIMD_WIDTH > sub(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Subtracts the elements of two Vec's.
Definition base.H:388
static Vec< T, SIMD_WIDTH > subs(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Subtracts the elements of two Vec's using saturated arithmetic.
Definition base.H:405
static Vec< T, SIMD_WIDTH > avg(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vec's, rounded up.
Definition base.H:456
static Vec< T, SIMD_WIDTH > div2rd(const Vec< T, SIMD_WIDTH > &a)
Divides all elements of a Vec by 2 and rounds down the result.
Definition ext.H:1776
static Vec< T, SIMD_WIDTH > adds(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Adds the elements of two Vec's using saturated arithmetic.
Definition base.H:374
static Vec< T, SIMD_WIDTH > div2r0(const Vec< T, SIMD_WIDTH > &a)
Divides all elements of a Vec by 2 and rounds the result to 0.
Definition ext.H:1696
static Vec< T, SIMD_WIDTH > div(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Divides the elements of two Vec's.
Definition base.H:439
static Vec< T, SIMD_WIDTH > absDiff(const Vec< T, SIMD_WIDTH > &v1, const Vec< T, SIMD_WIDTH > &v2)
Computes the absolute difference of the elements of two Vec's.
Definition ext.H:1860
static Vec< T, SIMD_WIDTH > avgru(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vec's, rounded up.
Definition ext.H:1564
static Vec< T, SIMD_WIDTH > add(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Adds the elements of two Vec's.
Definition base.H:357
static Vec< T, SIMD_WIDTH > mul(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Multiplies the elements of two Vec's.
Definition base.H:421
static Vec< T, SIMD_WIDTH > cmplt(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for less-than ( < ).
Definition base.H:924
static Vec< T, SIMD_WIDTH > cmple(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for less-than-or-equal ( <= ).
Definition base.H:945
static Vec< T, SIMD_WIDTH > cmpneq(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for inequality ( != ).
Definition base.H:1029
static Vec< T, SIMD_WIDTH > cmpge(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for greater-than-or-equal ( >= ).
Definition base.H:987
static Vec< T, SIMD_WIDTH > cmpgt(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for greater-than ( > ).
Definition base.H:1008
static Vec< T, SIMD_WIDTH > cmpeq(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for equality ( == ).
Definition base.H:966
float Float
Single-precision floating point number (32-bit)
Definition types.H:56
static Vec< T, SIMD_WIDTH > slle(const Vec< T, SIMD_WIDTH > &a)
Shifts a Vec left by a constant number of elements, shifting in zero elements.
Definition base.H:1353
static Vec< T, SIMD_WIDTH > srle(const Vec< T, SIMD_WIDTH > &a)
Shifts a Vec right by a constant number of elements, shifting in zero elements.
Definition base.H:1338
static T extract(const Vec< T, SIMD_WIDTH > &a)
Extracts a single value from a Vec.
Definition base.H:1072
static Vec< T, 16 > extractLane(const Vec< T, SIMD_WIDTH > &a)
Extracts a 16-byte lane from a Vec as a Vec < T, 16 >.
Definition base.H:1086
static void fdivmul(const Vec< Tin, SIMD_WIDTH > vecsNum[numInVecs< Tout, Tin >()], const Vec< Tin, SIMD_WIDTH > vecsDenom[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Divides Vec's element-wise, then multiplies with a constant factor in floating point arithmetic.
Definition ext.H:748
static void fwaddmul(const Vec< Tin, SIMD_WIDTH > vecsIn1[numInVecs< Tout, Tin >()], const Vec< Tin, SIMD_WIDTH > vecsIn2[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > w, dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Linearly interpolates Vec's element-wise with a constant weight and then scales by a constant factor ...
Definition ext.H:1050
static void fmul(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Multiplies Vec's element-wise with a floating point constant in floating point arithmetic.
Definition ext.H:931
static void fdivMsigmoidmul(const Vec< Tin, SIMD_WIDTH > vecsNum[DIM][NVEC], const Vec< Tin, SIMD_WIDTH > vecsDenom[DIM][NVEC], const double w[DIM], const double w0[DIM], double fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Special function used in MinWarping.
Definition ext.H:901
static void faddmul(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > off, dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Adds a floating point constant to the elements of Vec's, then multiplies with a floating point consta...
Definition ext.H:965
typename std::conditional< internal::vec::max(sizeof(Tout), sizeof(Tin))<= sizeof(Float), Float, Double >::type BigEnoughFloat
Smallest floating point type that is at least as big as the input and output types.
Definition vec.H:266
static void fmuladd(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, dont_deduce< Tfloat > off, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Multiplies the elements of Vec's with a floating point constant, then adds a floating point constant ...
Definition ext.H:1004
static T hmax(const Vec< T, SIMD_WIDTH > &v)
Calculates the maximum of all elements of a Vec.
Definition ext.H:1281
static Vec< T, SIMD_WIDTH > hadds(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Horizontally adds adjacent elements of two Vec's with saturation.
Definition base.H:493
static T hmin(const Vec< T, SIMD_WIDTH > &v)
Calculates the minimum of all elements of a Vec.
Definition ext.H:1267
static simd::Vec< T, SIMD_WIDTH > integrate(const simd::Vec< T, SIMD_WIDTH > &v)
Integrates the values of a Vec.
Definition ext.H:3330
static Vec< T, SIMD_WIDTH > hadd(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Horizontally adds adjacent elements of two Vec's.
Definition base.H:477
static Vec< T, SIMD_WIDTH > setunity()
Sets all elements of a Vec to the value 1.
Definition ext.H:3393
static Vec< T, SIMD_WIDTH > setmax()
Sets all elements of a Vec to the maximum value of the element type.
Definition ext.H:3381
static Vec< T, SIMD_WIDTH > setmin()
Sets all elements of a Vec to the minimum value of the element type.
Definition ext.H:3368
static Vec< T, SIMD_WIDTH > setnegunity()
Sets all elements of a Vec to the value -1.
Definition ext.H:3407
static Vec< T, SIMD_WIDTH > setones()
Sets all bits of a Vec to 1.
Definition ext.H:3348
static Vec< T, SIMD_WIDTH > setzero()
Returns a Vec with all elements set to zero.
Definition base.H:70
static Vec< T, SIMD_WIDTH > set1(const dont_deduce< T > a)
Returns a Vec with all elements set to the same value.
Definition base.H:88
Horizontal addition class for iterative horizontal accumulation.
Definition ext.H:1452
Horizontal saturated addition class for iterative horizontal accumulation.
Definition ext.H:1476
Horizontal maximum class for iterative horizontal accumulation.
Definition ext.H:1522
Horizontal minimum class for iterative horizontal accumulation.
Definition ext.H:1499
static Vec< T, SIMD_WIDTH > bit_and(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise AND of two Vec's.
Definition base.H:732
static Vec< T, SIMD_WIDTH > bit_xor(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise XOR of two Vec's.
Definition base.H:776
static Vec< T, SIMD_WIDTH > bit_or(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise OR of two Vec's.
Definition base.H:746
static Vec< T, SIMD_WIDTH > bit_not(const Vec< T, SIMD_WIDTH > &a)
Computes the bitwise NOT of a Vec.
Definition base.H:789
static Vec< T, SIMD_WIDTH > sqrt(const Vec< T, SIMD_WIDTH > &a)
Computes the square root of the elements of a Vec.
Definition base.H:584
static Vec< T, SIMD_WIDTH > min(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the minimum of the elements of two Vec's.
Definition base.H:606
static Vec< T, SIMD_WIDTH > max(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the maximum of the elements of two Vec's.
Definition base.H:620
static Vec< T, SIMD_WIDTH > neg(const Vec< T, SIMD_WIDTH > &a)
Negates the elements of a Vec.
Definition base.H:635
static Vec< T, SIMD_WIDTH > abs(const Vec< T, SIMD_WIDTH > &a)
Computes the absolute value of the elements of a Vec.
Definition base.H:654
static Vec< T, SIMD_WIDTH > load(const T *const p)
Loads a Vec from aligned memory.
Definition base.H:209
static Vec< T, SIMD_WIDTH > loadu(const T *const p)
Loads a Vec from unaligned memory.
Definition base.H:231
static void store(T *const p, const Vec< T, SIMD_WIDTH > &a)
Stores a Vec to aligned memory.
Definition base.H:246
static void storeu(T *const p, const Vec< T, SIMD_WIDTH > &a)
Stores a Vec to unaligned memory.
Definition base.H:265
static void load_storeu(const T *const src, T *const dst)
Copies a single Vec from one aligned memory location to another unaligned memory location.
Definition ext.H:491
static void loadu_storeu(const T *const src, T *const dst)
Copies a single Vec from one unaligned memory location to another unaligned memory location.
Definition ext.H:507
static void loadu_store(const T *const src, T *const dst)
Copies a single Vec from one unaligned memory location to another aligned memory location.
Definition ext.H:474
static void load_store(const T *const src, T *const dst)
Copies a single Vec from one aligned memory location to another aligned memory location.
Definition ext.H:457
static void print(const char *format, const Vec< T, SIMD_WIDTH > &vec)
Writes the formatted elements of a Vec to stdout.
Definition ext.H:157
static void fprint(FILE *f, const char *format, const Vec< T, SIMD_WIDTH > &vec)
Writes the formatted elements of a Vec to a file.
Definition ext.H:127
static void transpose(const Vec< T, SIMD_WIDTH > inRows[Vec< T, SIMD_WIDTH >::elems], Vec< T, SIMD_WIDTH > outRows[Vec< T, SIMD_WIDTH >::elems])
Transposes a matrix held in an array of Vec's.
Definition ext.H:3223
static Vec< T, SIMD_WIDTH > reverse(const Vec< T, SIMD_WIDTH > &a)
Reverses the order of the elements of a Vec.
Definition base.H:1101
static Vec< T, SIMD_WIDTH > srli(const Vec< T, SIMD_WIDTH > &a)
Shifts the elements of a Vec right by a constant number of bits while shifting in zeros.
Definition base.H:828
static Vec< T, SIMD_WIDTH > srai(const Vec< T, SIMD_WIDTH > &a)
Shifts the elements of a Vec right by a constant number of bits while shifting in the sign bit.
Definition base.H:812
static void bitonicSortSortedPairs(Vec< T, SIMD_WIDTH > vecs[Vec< T, SIMD_WIDTH >::elems])
Fuses consecutive pairs of sorted Vec's such that the pair is sorted over the two vectors.
Definition ext.H:3590
static void bitonicSort(Vec< T, SIMD_WIDTH > vecs[Vec< T, SIMD_WIDTH >::elems])
Sorts multiple Vec's independently using the bitonic sort algorithm.
Definition ext.H:3554
static void verticalBitonicSort(T data[LENGTH])
Sorts data vector using vertical version of bitonic sort. Assumes that data size is a power of 2 time...
Definition ext.H:3619
static void unswizzle(Vec< T, SIMD_WIDTH > v[2 *N])
Unswizzle/interleave/convert from SoA to AoS multiple Vec's in-place.
Definition ext.H:3203
static void swizzle2(Vec< T, SIMD_WIDTH > v[2 *N])
Swizzle/de-interleave/convert from AoS to SoA multiple Vec's in-place.
Definition ext.H:3160
static Vec< Tout, SIMD_WIDTH > cvts(const Vec< Tin, SIMD_WIDTH > &a)
Converts the elements of a Vec between integer and floating point types of the same size.
Definition base.H:1445
static Vec< Tout, SIMD_WIDTH > packs(const Vec< Tin, SIMD_WIDTH > &a, const Vec< Tin, SIMD_WIDTH > &b)
Packs two Vec's into one by converting the elements into the next smaller type with saturation.
Definition base.H:1397
static void extend(const Vec< Tin, SIMD_WIDTH > &vIn, Vec< Tout, SIMD_WIDTH > vOut[sizeof(Tout)/sizeof(Tin)])
Extends the elements of a Vec to a larger or equally sized type.
Definition base.H:1423
static constexpr size_t numInVecs()
Number of input vectors for functions that potentially change the size of the elements but not the nu...
Definition vec.H:201
static constexpr size_t numOutVecs()
Number of output vectors for functions that potentially change the size of the elements but not the n...
Definition vec.H:216
static void convert(const Vec< Tin, SIMD_WIDTH > inVecs[numInVecs< Tout, Tin >()], Vec< Tout, SIMD_WIDTH > outVecs[numOutVecs< Tout, Tin >()])
Converts (potentially multiple) Vec's between different types.
Definition ext.H:676
SortSlope
Used to indicate the direction of a sort function.
Definition types.H:115
static void zip16(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of each 16-byte lane of two Vec's.
Definition base.H:1286
static void zip(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of two Vec's.
Definition base.H:1247
static Vec< T, SIMD_WIDTH > unpack(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Interleaves blocks of elements from the high or low half of two Vec's.
Definition base.H:1169
static void unzip(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Deinterleaves blocks of elements of two Vec's.
Definition base.H:1316
static Vec< T, SIMD_WIDTH > unpack16(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Interleaves blocks of elements from the high or low half of each 16-byte lane of two Vec's.
Definition base.H:1209
Namespace for T-SIMD.
Definition time_measurement.H:161
typename internal::dont_deduce< T >::type dont_deduce
Helper type to prevent template argument deduction.
Definition types.H:416
Iterative horizontal accumulator. Calculates the horizontal accumulation of multiple (Vec<T,...
Definition ext.H:1311
bool isEmpty() const
Checks if the horizontal accumulation is empty, i.e. if no Vec has been pushed yet.
Definition ext.H:1324
void push(const Vec< T, SIMD_WIDTH > &v)
Pushes the next Vec to be horizontally accumulated. Does nothing if the horizontal accumulation is al...
Definition ext.H:1339
Vec< T, SIMD_WIDTH > get()
Gets the result of the horizontal accumulation. Finishes the horizontal accumulation if it is not don...
Definition ext.H:1369
void reset()
Resets the horizontal accumulation.
Definition ext.H:1378
void finish()
Finishes the horizontal accumulation by pushing neutral values until the horizontal accumulation is d...
Definition ext.H:1356
bool isDone() const
Checks if the horizontal accumulation is done.
Definition ext.H:1331