41#ifndef SIMD_VEC_EXT_H_
42#define SIMD_VEC_EXT_H_
44#include "autogen/ext_transpose.H"
// Recursive floor(log2(x)) for an integral type T (enforced by the
// static_assert below).
// NOTE(review): the recursion only terminates at x == 1. For x == 0 the
// expression keeps shifting 0 and never reaches the base case, so callers
// must pass x >= 1.
62static constexpr SIMD_INLINE T floorlog2(T x)
64 static_assert(std::is_integral<T>::value,
"");
// base case floorlog2(1) == 0; otherwise 1 + floorlog2 of x halved
65 return x == 1 ? 0 : 1 + floorlog2(x >> 1);
// Three alternative definitions of NATIVE_SIMD_REG_COUNT (32, 16 or 8).
// NOTE(review): the preprocessor conditions that select between them are not
// visible in this excerpt; presumably the value is the number of SIMD
// registers of the target instruction set - confirm.
78#define NATIVE_SIMD_REG_COUNT 32
80#define NATIVE_SIMD_REG_COUNT 16
83#define NATIVE_SIMD_REG_COUNT 8
// Writes vector elements to the stream `f`, calling fprintf once per element
// with the caller-supplied printf-style `format`.
// NOTE(review): the vector parameter and the code that fills `buf` and
// defines `elems` are not visible in this excerpt.
126template <
typename T,
size_t SIMD_WIDTH>
127static SIMD_INLINE
void fprint(FILE *f,
const char *format,
// one fprintf call per element, in index order
138 for (
size_t i = 0; i < elems; i++)
143 fprintf(f, format, buf[i]);
156template <
typename T,
size_t SIMD_WIDTH>
159 fprint(stdout, format, vec);
// fprint overload with a separator: `separator` is appended to `format` and
// the combined string is passed on as the format of the three-argument
// fprint. Because the separator becomes part of a printf format string, a '%'
// inside it would be interpreted as a conversion specification.
176template <
typename T,
size_t SIMD_WIDTH>
177static SIMD_INLINE
void fprint(FILE *f,
const char *format,
178 const char *separator,
// build "<format><separator>" once, then delegate
185 std::string fmtSep = std::string(format) + std::string(separator);
186 fprint(f, fmtSep.c_str(), vec);
// Convenience wrapper: same as the separator overload of fprint, with stdout
// as the output stream.
202template <
typename T,
size_t SIMD_WIDTH>
203static SIMD_INLINE
void print(
const char *format,
const char *separator,
206 fprint(stdout, format, separator, vec);
225template <
typename T,
size_t SIMD_WIDTH>
245template <
typename T,
size_t SIMD_WIDTH>
264template <
typename T,
size_t SIMD_WIDTH>
284template <
typename T,
size_t SIMD_WIDTH>
303template <
typename T,
size_t SIMD_WIDTH>
324template <
typename T,
size_t SIMD_WIDTH>
336template <
typename T,
size_t SIMD_WIDTH>
339 static SIMD_INLINE
void _store(T *
const p,
const Vec<T, SIMD_WIDTH> &outVec)
341 return store(p, outVec);
345template <
typename T,
size_t SIMD_WIDTH>
348 static SIMD_INLINE
void _store(T *
const p,
const Vec<T, SIMD_WIDTH> &outVec)
// Store16, general case: compile-time recursion that halves STORE_WIDTH.
// The job is split into two sub-jobs with STORE_WIDTH / 2:
//  - first half:  same SRC_OFF, destination offset 2 * DST_OFF
//  - second half: SRC_OFF advanced by SIMD_WIDTH / STORE_WIDTH,
//                 destination offset 2 * DST_OFF + STORE_STOP
// The recursion ends in the specialization for STORE_WIDTH == 16.
// `Store` is the store policy (a class template providing _store).
364template <
template <
typename,
size_t>
class Store,
typename T,
365 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW,
size_t STORE_STOP,
366 size_t STORE_WIDTH,
size_t SRC_OFF,
size_t DST_OFF>
369 static SIMD_INLINE
void _store16(
374 Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, STORE_WIDTH / 2,
375 SRC_OFF, 2 * DST_OFF>::_store16(p, outVecs);
376 Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, STORE_WIDTH / 2,
377 SRC_OFF + SIMD_WIDTH / STORE_WIDTH,
378 2 * DST_OFF + STORE_STOP>::_store16(p, outVecs);
// Store16 specialization for STORE_WIDTH == 16: stores one vector per ROW and
// recurses with ROW + 1 (the recursion over ROW ends in a further
// specialization where ROW == STORE_STOP).
//   STEP = SIMD_WIDTH / 16
//   VO   = SRC_OFF + ROW * STEP      index of the source vector in outVecs
//   OFF  = (DST_OFF + ROW) * NUMROWS element offset added to p
382template <
template <
typename,
size_t>
class Store,
typename T,
383 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW,
size_t STORE_STOP,
384 size_t SRC_OFF,
size_t DST_OFF>
385struct Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW, STORE_STOP, 16, SRC_OFF,
388 static constexpr auto STEP = SIMD_WIDTH / 16;
389 static constexpr auto VO = SRC_OFF + ROW * STEP;
390 static constexpr auto OFF = (DST_OFF + ROW) * NUMROWS;
392 static SIMD_INLINE
void _store16(
// store the current row, then continue with the next row
396 Store<T, SIMD_WIDTH>::_store(p + OFF, outVecs[VO]);
397 Store16<Store, T, SIMD_WIDTH, NUMROWS, ROW + 1, STORE_STOP, 16, SRC_OFF,
398 DST_OFF>::_store16(p, outVecs);
402template <
template <
typename,
size_t>
class Store,
typename T,
403 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t STORE_STOP,
size_t SRC_OFF,
405struct Store16<Store, T, SIMD_WIDTH, NUMROWS, STORE_STOP, STORE_STOP, 16,
408 static SIMD_INLINE
void _store16(
// Entry point of the Store16 recursion using the internal::ext::Store policy.
//   numRows   = SIMD_WIDTH / sizeof(T)  (elements per vector)
//   storeStop = 16 / sizeof(T)          (elements per 16 bytes)
// The recursion starts at ROW 0 with STORE_WIDTH == SIMD_WIDTH and both
// offsets 0.
// NOTE(review): presumably the Store policy performs aligned stores, in
// contrast to storeu16 - confirm against the policy definitions.
415template <
typename T,
size_t SIMD_WIDTH>
416static SIMD_INLINE
void store16(
419 const auto numRows = SIMD_WIDTH /
sizeof(T);
420 const auto storeStop = 16 /
sizeof(T);
421 internal::ext::Store16<internal::ext::Store, T, SIMD_WIDTH, numRows, 0,
422 storeStop, SIMD_WIDTH, 0, 0>::_store16(p, outVecs);
// Same as store16, but instantiates the Store16 recursion with the
// internal::ext::StoreU policy instead of internal::ext::Store.
//   numRows   = SIMD_WIDTH / sizeof(T)  (elements per vector)
//   storeStop = 16 / sizeof(T)          (elements per 16 bytes)
// NOTE(review): presumably StoreU performs unaligned stores - confirm.
425template <
typename T,
size_t SIMD_WIDTH>
426static SIMD_INLINE
void storeu16(
429 const auto numRows = SIMD_WIDTH /
sizeof(T);
430 const auto storeStop = 16 /
sizeof(T);
431 internal::ext::StoreU16<internal::ext::StoreU, T, SIMD_WIDTH, numRows, 0,
432 storeStop, SIMD_WIDTH, 0, 0>::_store16(p, outVecs);
456template <
size_t SIMD_WIDTH,
typename T>
457static SIMD_INLINE
void load_store(
const T *
const src, T *
const dst)
473template <
size_t SIMD_WIDTH,
typename T>
474static SIMD_INLINE
void loadu_store(
const T *
const src, T *
const dst)
490template <
size_t SIMD_WIDTH,
typename T>
491static SIMD_INLINE
void load_storeu(
const T *
const src, T *
const dst)
506template <
size_t SIMD_WIDTH,
typename T>
527template <
typename T,
size_t SIMD_WIDTH>
528static SIMD_INLINE Vec<T, SIMD_WIDTH> packs(
const Vec<T, SIMD_WIDTH> a[1],
529 OutputType<T>, Compression<1>)
// Array form of packs for a compression factor of 2 (selected via the
// Compression<2> tag): the two input vectors are combined by a single
// two-argument packs<Tout>.
536template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH>
537static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
const Vec<Tin, SIMD_WIDTH> a[2],
538 OutputType<Tout>, Compression<2>)
540 return packs<Tout>(a[0], a[1]);
// Array form of packs for a compression factor of 4 when the output element
// is 1 byte wide: done in two stages, first pairwise to Short, then the two
// Short vectors to Tout.
546template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH,
547 SIMD_ENABLE_IF(
sizeof(Tout) == 1)>
548static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
const Vec<Tin, SIMD_WIDTH> a[4],
549 OutputType<Tout>, Compression<4>)
551 return packs<Tout>(packs<Short>(a[0], a[1]), packs<Short>(a[2], a[3]));
// Array form of packs for a compression factor of 4 when the output element
// is 2 bytes wide: done in two stages, first pairwise to Int, then the two
// Int vectors to Tout. The extra `typename = void` parameter distinguishes
// this template from the sizeof(Tout) == 1 overload.
555template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH,
556 SIMD_ENABLE_IF(
sizeof(Tout) == 2),
typename =
void>
557static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
const Vec<Tin, SIMD_WIDTH> a[4],
558 OutputType<Tout>, Compression<4>)
560 return packs<Tout>(packs<Int>(a[0], a[1]), packs<Int>(a[2], a[3]));
// Array form of packs for a compression factor of 4 with Double input:
// the four Double vectors are packed pairwise to Int, and the two Int vectors
// are then packed to Tout.
565template <
typename Tout,
size_t SIMD_WIDTH>
566static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
567 const Vec<Double, SIMD_WIDTH> a[4], OutputType<Tout>, Compression<4>)
570 return packs<Tout>(packs<Int>(a[0], a[1]), packs<Int>(a[2], a[3]));
// Array form of packs for a compression factor of 8: the first four and the
// last four input vectors are each packed to one Short vector (array form of
// packs<Short>), and the two Short vectors are then packed to Tout.
575template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH>
576static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
const Vec<Tin, SIMD_WIDTH> a[8],
577 OutputType<Tout>, Compression<8>)
580 return packs<Tout>(packs<Short>(a), packs<Short>(a + 4));
585template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH>
586static SIMD_INLINE Vec<Tout, SIMD_WIDTH> packs(
const Vec<Tin, SIMD_WIDTH> a[1],
587 OutputType<Tout>, Compression<1>)
589 static_assert(
sizeof(Tin) ==
sizeof(Tout),
"");
590 static_assert(std::is_floating_point<Tin>::value !=
591 std::is_floating_point<Tout>::value,
620template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH>
624 return internal::ext::packs(
625 a, internal::OutputType<Tout>(),
626 internal::Compression<
sizeof(Tin) /
sizeof(Tout)>());
// convert, narrowing case (sizeof(Tout) < sizeof(Tin)): the
// sizeof(Tin) / sizeof(Tout) input vectors are packed into the single output
// vector outVecs[0] via the array form of packs<Tout>.
635template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH,
636 SIMD_ENABLE_IF(
sizeof(Tout) <
sizeof(Tin))>
637static SIMD_INLINE
void convert(
638 const Vec<Tin, SIMD_WIDTH> inVecs[
sizeof(Tin) /
sizeof(Tout)],
639 Vec<Tout, SIMD_WIDTH> outVecs[1])
641 outVecs[0] = packs<Tout>(inVecs);
// convert, non-narrowing case (sizeof(Tout) >= sizeof(Tin)): the single input
// vector is handed to extend, which writes into outVecs.
// NOTE(review): the parameter is declared as outVecs[1], but an array bound in
// a parameter is not enforced; extend presumably writes
// sizeof(Tout) / sizeof(Tin) vectors, so the caller must provide that many -
// confirm against extend.
644template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH,
645 SIMD_ENABLE_IF(
sizeof(Tout) >=
sizeof(Tin)),
typename =
void>
646static SIMD_INLINE
void convert(
const Vec<Tin, SIMD_WIDTH> inVecs[1],
647 Vec<Tout, SIMD_WIDTH> outVecs[1])
649 extend(inVecs[0], outVecs);
675template <
typename Tout,
typename Tin,
size_t SIMD_WIDTH>
680 internal::ext::convert(inVecs, outVecs);
690template <
typename Tin,
typename Tout,
typename T
float>
691static constexpr SIMD_INLINE
size_t numCalcVecs()
695 "numCalcVecs() must be equal for input and output");
// Extends every input vector to the calculation type Tfloat. Input vector i
// is written to floatVecs starting at index i * sizeof(Tfloat) / sizeof(Tin),
// i.e. each input vector occupies sizeof(Tfloat) / sizeof(Tin) consecutive
// slots.
// NOTE(review): this template is declared as <Tin, Tout, ...> but the helpers
// are instantiated as numCalcVecs<Tout, Tin, Tfloat> and numInVecs<Tout, Tin>;
// confirm the argument order matches the helpers' declarations.
699template <
typename Tin,
typename Tout,
typename T
float,
size_t SIMD_WIDTH>
700static SIMD_INLINE
void extendInToFloat(
702 Vec<Tfloat, SIMD_WIDTH> floatVecs[numCalcVecs<Tout, Tin, Tfloat>()])
704 for (
size_t i = 0; i < numInVecs<Tout, Tin>(); ++i) {
705 extend(inVecs[i], &floatVecs[i *
sizeof(Tfloat) /
sizeof(Tin)]);
// Packs the Tfloat calculation vectors back to the output type. Output vector
// i is produced by the array form of packs<Tout> from the group of float
// vectors starting at index i * sizeof(Tfloat) / sizeof(Tout).
// NOTE(review): as in extendInToFloat, numCalcVecs / numOutVecs are
// instantiated with <Tout, Tin, ...> while this template is declared as
// <Tin, Tout, ...>; confirm the argument order.
709template <
typename Tin,
typename Tout,
typename T
float,
size_t SIMD_WIDTH>
710static SIMD_INLINE
void packsOutFromFloat(
711 const Vec<Tfloat, SIMD_WIDTH> floatVecs[numCalcVecs<Tout, Tin, Tfloat>()],
714 for (
size_t i = 0; i < numOutVecs<Tout, Tin>(); ++i) {
715 outVecs[i] = packs<Tout>(&floatVecs[i *
sizeof(Tfloat) /
sizeof(Tout)]);
746template <
typename Tout,
typename Tin,
754 static_assert(
sizeof(Tin) <=
sizeof(Tfloat),
"");
755 static_assert(
sizeof(Tout) <=
sizeof(Tfloat),
"");
756 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
760 internal::ext::extendInToFloat<Tin, Tout>(vecsNum, numF);
761 internal::ext::extendInToFloat<Tin, Tout>(vecsDenom, denomF);
762 for (
size_t i = 0; i < nFloatVecs; i++) {
763 resF[i] =
mul(
div(numF[i], denomF[i]), facVec);
765 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
// fdivMsigmoidmul, narrowing case (sizeof(Tout) < sizeof(Tin)).
//   nIn   = sizeof(Tin) / sizeof(Tout)   input vectors consumed per dimension
//   fanIn = sizeof(Float) / sizeof(Tin)  Float vectors per extended input
// For every input vector i and every Float vector j of its extension, the sum
//   yF = yF + wF[d] * (numF[d][j] / denomF[d][j] - w0F[d])   over d < DIM
// is accumulated; k counts the Float vectors across all i.
// NOTE(review): the initialisation of wF / w0F / yF and the steps between the
// sum and the store into resF (they begin with yF * yF) are not visible in
// this excerpt.
// All nIn * fanIn Float results are finally packed into vecsOut[0].
778template <
size_t DIM,
size_t NVEC,
typename Tout,
typename Tin,
779 size_t SIMD_WIDTH, SIMD_ENABLE_IF(
sizeof(Tout) <
sizeof(Tin))>
780static SIMD_INLINE
void fdivMsigmoidmul(
781 const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
782 const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC],
const double w[DIM],
783 const double w0[DIM],
double fac, Vec<Tout, SIMD_WIDTH> vecsOut[1])
785 const auto nIn =
sizeof(Tin) /
sizeof(Tout);
786 const auto fanIn =
sizeof(
Float) /
sizeof(Tin);
789 Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][fanIn],
790 denomF[DIM][fanIn], resF[nIn * fanIn];
791 for (
size_t d = 0; d < DIM; d++) {
799 for (
size_t i = 0, k = 0; i < nIn; i++) {
// extend numerator and denominator vector i of every dimension to Float
800 for (
size_t d = 0; d < DIM; d++) {
801 extend(vecsNum[d][i], numF[d]);
802 extend(vecsDenom[d][i], denomF[d]);
804 for (
size_t j = 0; j < fanIn; j++, k++) {
806 for (
size_t d = 0; d < DIM; d++) {
807 yF =
add(yF,
mul(wF[d],
sub(
div(numF[d][j], denomF[d][j]), w0F[d])));
809 auto y4F =
mul(yF, yF);
814 vecsOut[0] = packs<Tout>(resF);
// fdivMsigmoidmul, widening case (sizeof(Tout) > sizeof(Tin)).
//   nOut   = sizeof(Tout) / sizeof(Tin)    output vectors produced
//   fanOut = sizeof(Float) / sizeof(Tout)  Float vectors packed per output
// Only the first vector of each dimension (*vecsNum[d], *vecsDenom[d]) is
// read; it is extended into nOut * fanOut Float vectors. k walks over these
// Float vectors while i / j select the output vector and the slot within it.
// For each k the sum
//   yF = yF + wF[d] * (numF[d][k] / denomF[d][k] - w0F[d])   over d < DIM
// is accumulated, and each output vector i is packed from resF.
// NOTE(review): the initialisation of wF / w0F / yF and the steps between the
// sum and the store into resF (they begin with yF * yF) are not visible in
// this excerpt.
817template <
size_t DIM,
size_t NVEC,
typename Tout,
typename Tin,
818 size_t SIMD_WIDTH, SIMD_ENABLE_IF(
sizeof(Tout) >
sizeof(Tin)),
820static SIMD_INLINE
void fdivMsigmoidmul(
821 const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
822 const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC],
const double w[DIM],
823 const double w0[DIM],
double fac,
824 Vec<Tout, SIMD_WIDTH> vecsOut[
sizeof(Tout) /
sizeof(Tin)])
826 const auto nOut =
sizeof(Tout) /
sizeof(Tin);
827 const auto fanOut =
sizeof(
Float) /
sizeof(Tout);
830 Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][nOut * fanOut],
831 denomF[DIM][nOut * fanOut], resF[fanOut];
832 for (
size_t d = 0; d < DIM; d++) {
835 extend(*vecsNum[d], numF[d]);
836 extend(*vecsDenom[d], denomF[d]);
841 for (
size_t i = 0, k = 0; i < nOut; i++) {
842 for (
size_t j = 0; j < fanOut; j++, k++) {
844 for (
size_t d = 0; d < DIM; d++) {
845 yF =
add(yF,
mul(wF[d],
sub(
div(numF[d][k], denomF[d][k]), w0F[d])));
847 auto y4F =
mul(yF, yF);
851 vecsOut[i] = packs<Tout>(resF);
// fdivMsigmoidmul, same-size case (sizeof(Tout) == sizeof(Tin)).
//   fanInOut = sizeof(Float) / sizeof(Tin)  Float vectors per input/output
// The first vector of each dimension is extended to Float; for each Float
// vector j the sum
//   yF = yF + wF[d] * (numF[d][j] / denomF[d][j] - w0F[d])   over d < DIM
// is accumulated, and the fanInOut results are packed into vecsOut[0].
// The two extra `typename = void` parameters distinguish this template from
// the other two fdivMsigmoidmul overloads.
// NOTE(review): the initialisation of wF / w0F / yF and the steps between the
// sum and the store into resF (they begin with yF * yF) are not visible in
// this excerpt.
855template <
size_t DIM,
size_t NVEC,
typename Tout,
typename Tin,
856 size_t SIMD_WIDTH, SIMD_ENABLE_IF(
sizeof(Tout) ==
sizeof(Tin)),
857 typename = void,
typename =
void>
858static SIMD_INLINE
void fdivMsigmoidmul(
859 const Vec<Tin, SIMD_WIDTH> vecsNum[DIM][NVEC],
860 const Vec<Tin, SIMD_WIDTH> vecsDenom[DIM][NVEC],
const double w[DIM],
861 const double w0[DIM],
double fac, Vec<Tout, SIMD_WIDTH> vecsOut[1])
863 const auto fanInOut =
sizeof(
Float) /
sizeof(Tin);
866 Vec<Float, SIMD_WIDTH> wF[DIM], w0F[DIM], numF[DIM][fanInOut],
867 denomF[DIM][fanInOut], resF[fanInOut];
868 for (
size_t d = 0; d < DIM; d++) {
871 extend(*vecsNum[d], numF[d]);
872 extend(*vecsDenom[d], denomF[d]);
875 for (
size_t j = 0; j < fanInOut; j++) {
877 for (
size_t d = 0; d < DIM; d++) {
878 yF =
add(yF,
mul(wF[d],
sub(
div(numF[d][j], denomF[d][j]), w0F[d])));
880 auto y4F =
mul(yF, yF);
884 vecsOut[0] = packs<Tout>(resF);
899template <
size_t DIM,
size_t NVEC,
typename Tout,
typename Tin,
904 const double w0[DIM],
double fac,
907 static_assert(
sizeof(Tin) <=
sizeof(
Float),
"");
908 static_assert(
sizeof(Tout) <=
sizeof(
Float),
"");
909 internal::ext::fdivMsigmoidmul<DIM, NVEC>(vecsNum, vecsDenom, w, w0, fac,
929template <
typename Tout,
typename Tin,
936 static_assert(
sizeof(Tin) <=
sizeof(Tfloat),
"");
937 static_assert(
sizeof(Tout) <=
sizeof(Tfloat),
"");
938 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
941 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
942 for (
size_t i = 0; i < nFloatVecs; i++) { resF[i] =
mul(inF[i], facVec); }
943 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
963template <
typename Tout,
typename Tin,
970 static_assert(
sizeof(Tin) <=
sizeof(Tfloat),
"");
971 static_assert(
sizeof(Tout) <=
sizeof(Tfloat),
"");
972 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
976 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
977 for (
size_t i = 0; i < nFloatVecs; i++) {
978 resF[i] =
mul(
add(inF[i], offVec), facVec);
980 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
1002template <
typename Tout,
typename Tin,
1009 static_assert(
sizeof(Tin) <=
sizeof(Tfloat),
"");
1010 static_assert(
sizeof(Tout) <=
sizeof(Tfloat),
"");
1011 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
1015 internal::ext::extendInToFloat<Tin, Tout>(vecsIn, inF);
1016 for (
size_t i = 0; i < nFloatVecs; i++) {
1017 resF[i] =
add(
mul(inF[i], facVec), offVec);
1019 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
1048template <
typename Tout,
typename Tin,
1056 static_assert(
sizeof(Tin) <=
sizeof(Tfloat),
"");
1057 static_assert(
sizeof(Tout) <=
sizeof(Tfloat),
"");
1058 constexpr auto nFloatVecs = internal::ext::numCalcVecs<Tout, Tin, Tfloat>();
1062 internal::ext::extendInToFloat<Tin, Tout>(vecsIn1, inF1);
1063 internal::ext::extendInToFloat<Tin, Tout>(vecsIn2, inF2);
1064 for (
size_t i = 0; i < nFloatVecs; i++) {
1065 resF[i] =
mul(facVec,
add(inF2[i],
mul(wVec,
sub(inF1[i], inF2[i]))));
1067 internal::ext::packsOutFromFloat<Tin, Tout>(resF, vecsOut);
// Horizontal, general case: recursive pairwise reduction over `num` vectors
// of the array v. Each level combines two sub-reductions of num / 2 vectors
// with hadd (or hadds in _hadds):
//  - left:  Horizontal<..., num / 2, i0, i0 + num / 4>
//  - right: Horizontal<..., num / 2, i1, i1 + num / 4>
// The recursion ends in the specialization for num == 2.
1089template <
typename T,
size_t SIMD_WIDTH,
size_t num,
size_t i0,
size_t i1>
1092 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
1095 return hadd(Horizontal<T, SIMD_WIDTH, num / 2, i0, i0 + num / 4>::_hadd(v),
1096 Horizontal<T, SIMD_WIDTH, num / 2, i1, i1 + num / 4>::_hadd(v));
1099 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
1103 Horizontal<T, SIMD_WIDTH, num / 2, i0, i0 + num / 4>::_hadds(v),
1104 Horizontal<T, SIMD_WIDTH, num / 2, i1, i1 + num / 4>::_hadds(v));
// Horizontal specialization for num == 2 (end of the recursion): combines the
// two vectors v[i0] and v[i1] directly with hadd / hadds.
1109template <
typename T,
size_t SIMD_WIDTH,
size_t i0,
size_t i1>
1110struct Horizontal<T, SIMD_WIDTH, 2, i0, i1>
1112 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
1115 return hadd(v[i0], v[i1]);
1118 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
1121 return hadds(v[i0], v[i1]);
1137template <
typename T,
size_t SIMD_WIDTH>
1141 return internal::ext::Horizontal<T, SIMD_WIDTH, Vec<T, SIMD_WIDTH>::elems, 0,
1156template <
typename T,
size_t SIMD_WIDTH>
1160 return internal::ext::Horizontal<T, SIMD_WIDTH, Vec<T, SIMD_WIDTH>::elems, 0,
// Horizontal1, general case: recursive reduction within a single vector,
// halving NUM at each level until the specialization for NUM == 1.
//  - _hadd / _hadds first reduce with NUM / 2 into u; how u is combined
//    afterwards is not visible in this excerpt.
//  - _hmin / _hmax first combine v with srle<NUM>(v) using min / max and then
//    recurse with NUM / 2 on the result.
// NOTE(review): srle<NUM> presumably shifts the vector by NUM elements -
// confirm against its definition.
1178template <
typename T,
size_t SIMD_WIDTH,
size_t NUM>
1181 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
const Vec<T, SIMD_WIDTH> &v)
1183 Vec<T, SIMD_WIDTH> u = Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hadd(v);
1187 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
const Vec<T, SIMD_WIDTH> &v)
1189 Vec<T, SIMD_WIDTH> u = Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hadds(v);
1193 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmin(
const Vec<T, SIMD_WIDTH> &v)
1195 return Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hmin(
min(
srle<NUM>(v), v));
1198 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmax(
const Vec<T, SIMD_WIDTH> &v)
1200 return Horizontal1<T, SIMD_WIDTH, NUM / 2>::_hmax(
max(
srle<NUM>(v), v));
1204template <
typename T,
size_t SIMD_WIDTH>
1205struct Horizontal1<T, SIMD_WIDTH, 1>
1207 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadd(
const Vec<T, SIMD_WIDTH> &v)
1212 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hadds(
const Vec<T, SIMD_WIDTH> &v)
1217 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmin(
const Vec<T, SIMD_WIDTH> &v)
1222 static SIMD_INLINE Vec<T, SIMD_WIDTH> _hmax(
const Vec<T, SIMD_WIDTH> &v)
1236template <
typename T,
size_t SIMD_WIDTH>
1240 internal::ext::Horizontal1<T, SIMD_WIDTH,
1241 SIMD_WIDTH /
sizeof(T) / 2>::_hadd(v));
1252template <
typename T,
size_t SIMD_WIDTH>
1256 internal::ext::Horizontal1<T, SIMD_WIDTH,
1257 SIMD_WIDTH /
sizeof(T) / 2>::_hadds(v));
1266template <
typename T,
size_t SIMD_WIDTH>
1270 internal::ext::Horizontal1<T, SIMD_WIDTH,
1271 SIMD_WIDTH /
sizeof(T) / 2>::_hmin(v));
1280template <
typename T,
size_t SIMD_WIDTH>
1284 internal::ext::Horizontal1<T, SIMD_WIDTH,
1285 SIMD_WIDTH /
sizeof(T) / 2>::_hmax(v));
1309template <
class HOp,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
1314 size_t stackTop = 0;
1341 if (
isDone()) {
return; }
1343 for (
size_t i = 0; count & (1 << i); i++) {
1345 acc = HOp::apply(stack[stackTop], acc);
1347 stack[stackTop] = acc;
1393template <
class HOp,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
1455 template <
typename T,
size_t SIMD_WIDTH>
1462 template <
typename T>
1463 static SIMD_INLINE T neutralValue()
1479 template <
typename T,
size_t SIMD_WIDTH>
1486 template <
typename T>
1487 static SIMD_INLINE T neutralValue()
1502 template <
typename T,
size_t SIMD_WIDTH>
1509 template <
typename T>
1510 static SIMD_INLINE T neutralValue()
1512 return std::numeric_limits<T>::max();
1525 template <
typename T,
size_t SIMD_WIDTH>
1532 template <
typename T>
1533 static SIMD_INLINE T neutralValue()
1535 return std::numeric_limits<T>::lowest();
1563template <
typename T,
size_t SIMD_WIDTH>
1579template <
typename T,
size_t SIMD_WIDTH,
1580 SIMD_ENABLE_IF(std::is_integral<T>::value)>
1581static SIMD_INLINE Vec<T, SIMD_WIDTH> avgrd(
const Vec<T, SIMD_WIDTH> &a,
1582 const Vec<T, SIMD_WIDTH> &b)
1588 return add(lsb,
add(as, bs));
1592template <
typename T,
size_t SIMD_WIDTH,
1593 SIMD_ENABLE_IF(std::is_floating_point<T>::value),
typename =
void>
1594static SIMD_INLINE Vec<T, SIMD_WIDTH> avgrd(
const Vec<T, SIMD_WIDTH> &a,
1595 const Vec<T, SIMD_WIDTH> &b)
1610template <
typename T,
size_t SIMD_WIDTH>
1614 return internal::ext::avgrd(a, b);
1623template <
size_t SIMD_WIDTH>
1624static SIMD_INLINE Vec<Byte, SIMD_WIDTH> div2r0(
const Vec<Byte, SIMD_WIDTH> &a)
1630template <
size_t SIMD_WIDTH>
1631static SIMD_INLINE Vec<SignedByte, SIMD_WIDTH> div2r0(
1632 const Vec<SignedByte, SIMD_WIDTH> &a)
1638template <
size_t SIMD_WIDTH>
1639static SIMD_INLINE Vec<Word, SIMD_WIDTH> div2r0(
const Vec<Word, SIMD_WIDTH> &a)
1644template <
size_t SIMD_WIDTH>
1645static SIMD_INLINE Vec<Short, SIMD_WIDTH> div2r0(
1646 const Vec<Short, SIMD_WIDTH> &a)
1652template <
size_t SIMD_WIDTH>
1653static SIMD_INLINE Vec<Int, SIMD_WIDTH> div2r0(
const Vec<Int, SIMD_WIDTH> &a)
1659template <
size_t SIMD_WIDTH>
1660static SIMD_INLINE Vec<Long, SIMD_WIDTH> div2r0(
const Vec<Long, SIMD_WIDTH> &a)
1667template <
size_t SIMD_WIDTH>
1668static SIMD_INLINE Vec<Float, SIMD_WIDTH> div2r0(
1669 const Vec<Float, SIMD_WIDTH> &a)
1675template <
size_t SIMD_WIDTH>
1676static SIMD_INLINE Vec<Double, SIMD_WIDTH> div2r0(
1677 const Vec<Double, SIMD_WIDTH> &a)
1695template <
typename T,
size_t SIMD_WIDTH>
1698 return internal::ext::div2r0(a);
1707template <
size_t SIMD_WIDTH>
1708static SIMD_INLINE Vec<Byte, SIMD_WIDTH> div2rd(
const Vec<Byte, SIMD_WIDTH> &a)
1714template <
size_t SIMD_WIDTH>
1715static SIMD_INLINE Vec<SignedByte, SIMD_WIDTH> div2rd(
1716 const Vec<SignedByte, SIMD_WIDTH> &a)
1721template <
size_t SIMD_WIDTH>
1722static SIMD_INLINE Vec<Word, SIMD_WIDTH> div2rd(
const Vec<Word, SIMD_WIDTH> &a)
1727template <
size_t SIMD_WIDTH>
1728static SIMD_INLINE Vec<Short, SIMD_WIDTH> div2rd(
1729 const Vec<Short, SIMD_WIDTH> &a)
1734template <
size_t SIMD_WIDTH>
1735static SIMD_INLINE Vec<Int, SIMD_WIDTH> div2rd(
const Vec<Int, SIMD_WIDTH> &a)
1740template <
size_t SIMD_WIDTH>
1741static SIMD_INLINE Vec<Long, SIMD_WIDTH> div2rd(
const Vec<Long, SIMD_WIDTH> &a)
1747template <
size_t SIMD_WIDTH>
1748static SIMD_INLINE Vec<Float, SIMD_WIDTH> div2rd(
1749 const Vec<Float, SIMD_WIDTH> &a)
1755template <
size_t SIMD_WIDTH>
1756static SIMD_INLINE Vec<Double, SIMD_WIDTH> div2rd(
1757 const Vec<Double, SIMD_WIDTH> &a)
1775template <
typename T,
size_t SIMD_WIDTH>
1778 return internal::ext::div2rd(a);
1799template <
typename T,
size_t SIMD_WIDTH>
1803 static_assert(std::is_floating_point<T>::value,
1804 "sign() is only available for floating-point types");
1823template <
size_t SIMD_WIDTH>
1824static SIMD_INLINE Vec<Byte, SIMD_WIDTH> absDiff(
1825 const Vec<Byte, SIMD_WIDTH> &v1,
const Vec<Byte, SIMD_WIDTH> &v2)
1831template <
size_t SIMD_WIDTH>
1832static SIMD_INLINE Vec<Word, SIMD_WIDTH> absDiff(
1833 const Vec<Word, SIMD_WIDTH> &v1,
const Vec<Word, SIMD_WIDTH> &v2)
1841template <
typename T,
size_t SIMD_WIDTH>
1842static SIMD_INLINE Vec<T, SIMD_WIDTH> absDiff(
const Vec<T, SIMD_WIDTH> &v1,
1843 const Vec<T, SIMD_WIDTH> &v2)
1845 static_assert(std::is_signed<T>::value,
"");
1859template <
typename T,
size_t SIMD_WIDTH>
1863 return internal::ext::absDiff(v1, v2);
1877template <
size_t PART,
size_t NUM_ELEMS,
typename T,
size_t SIMD_WIDTH>
1880 static SIMD_INLINE Vec<T, SIMD_WIDTH> _unpack(
const Vec<T, SIMD_WIDTH> &a,
1881 const Vec<T, SIMD_WIDTH> &b)
1887template <
size_t PART,
size_t NUM_ELEMS,
typename T,
size_t SIMD_WIDTH>
1890 static SIMD_INLINE Vec<T, SIMD_WIDTH> _unpack(
const Vec<T, SIMD_WIDTH> &a,
1891 const Vec<T, SIMD_WIDTH> &b)
// Transpose1, general case: computes one row of the transposed matrix as a
// binary tree of unpack operations over the input rows.
//   PART = lowest bit of NLOHI, passed to Unpack as its first argument
//   NEXT = NLOHI >> 1, the remaining bits, handed to the next level
//   LIDX / RIDX = first input row of the left / right subtree
//                 (INDEX and INDEX + ELEMS)
//   HALF = ELEMS / 2, the ELEMS value of the next level
// The result is Unpack<PART, ELEMS>::_unpack(left subtree, right subtree);
// the recursion ends in the specialization for ELEMS == 1.
1900template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
1905 size_t INDEX,
size_t NLOHI,
size_t ELEMS>
1908 static constexpr auto PART = (NLOHI & 0x01);
1909 static constexpr auto NEXT = (NLOHI >> 1);
1910 static constexpr auto LIDX = INDEX;
1911 static constexpr auto RIDX = INDEX + ELEMS;
1912 static constexpr auto HALF = ELEMS / 2;
1914 static SIMD_INLINE Vec<T, SIMD_WIDTH> _transpose1(
1921 return Unpack<PART, ELEMS, T, SIMD_WIDTH>::_unpack(
1922 Transpose1<Unpack, T, SIMD_WIDTH, LIDX, NEXT, HALF>::_transpose1(inRows),
1923 Transpose1<Unpack, T, SIMD_WIDTH, RIDX, NEXT, HALF>::_transpose1(inRows));
1928template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
1929 size_t SIMD_WIDTH,
size_t INDEX,
size_t NLOHI>
1930struct Transpose1<Unpack, T, SIMD_WIDTH, INDEX, NLOHI, 1>
1932 static constexpr auto PART = (NLOHI & 0x01);
1934 static SIMD_INLINE Vec<T, SIMD_WIDTH> _transpose1(
1940 return Unpack<PART, 1, T, SIMD_WIDTH>::_unpack(inRows[INDEX],
// Transpose, general case: compile-time loop over ROW. For the current ROW a
// Transpose1 tree is evaluated with INDEX 0, NLOHI == ROW and
// ELEMS == NUMROWS / 2, then the recursion continues with ROW + 1.
// NOTE(review): the destination of the Transpose1 result is not visible in
// this excerpt (presumably outRows[ROW]); the recursion ends in a
// specialization declared separately.
1948template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
1953 size_t NUMROWS,
size_t NUM_TRANSPOSE_ROWS,
size_t ROW>
1956 static SIMD_INLINE
void _transpose(
1958 Vec<T, SIMD_WIDTH> outRows[NUM_TRANSPOSE_ROWS])
1964 Transpose1<Unpack, T, SIMD_WIDTH, 0, ROW, NUMROWS / 2>::_transpose1(
1968 Transpose<Unpack, T, SIMD_WIDTH, NUMROWS, NUM_TRANSPOSE_ROWS,
1969 ROW + 1>::_transpose(inRows, outRows);
1974template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
1975 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t NUM_TRANSPOSE_ROWS>
1976struct Transpose<Unpack, T, SIMD_WIDTH, NUMROWS, NUM_TRANSPOSE_ROWS,
1979 static SIMD_INLINE
void _transpose(
1981 Vec<T, SIMD_WIDTH>[NUM_TRANSPOSE_ROWS])
// Transposes with the Unpack policy but produces only NUM_TRANSPOSE_ROWS
// output rows: starts the Transpose recursion at ROW 0 with
// NUMROWS == SIMD_WIDTH / sizeof(T).
1986template <
size_t NUM_TRANSPOSE_ROWS,
typename T,
size_t SIMD_WIDTH>
1987static SIMD_INLINE
void transpose_a_partial(
1989 Vec<T, SIMD_WIDTH> outRows[NUM_TRANSPOSE_ROWS])
1991 Transpose<Unpack, T, SIMD_WIDTH,
1993 SIMD_WIDTH /
sizeof(T), NUM_TRANSPOSE_ROWS, 0>::_transpose(inRows,
// Full transpose: transpose_a_partial with all SIMD_WIDTH / sizeof(T) output
// rows requested.
1998template <
typename T,
size_t SIMD_WIDTH>
1999static SIMD_INLINE
void transpose_a(
2003 transpose_a_partial<SIMD_WIDTH /
sizeof(T)>(inRows, outRows);
// CopyMatrix, general case: compile-time loop over ROW from the start value
// up to ROW_STOP; each step handles one row and then recurses with ROW + 1.
// The static_assert rejects instantiations with ROW >= ROW_STOP (ROW ==
// ROW_STOP is covered by a separate specialization).
// NOTE(review): the per-row statement is not visible in this excerpt;
// presumably it copies row ROW from v to v2 - confirm.
2011template <
typename T,
size_t SIMD_WIDTH,
size_t ROW,
size_t ROW_STOP>
2014 static_assert(ROW < ROW_STOP,
"ROW must be less than ROW_STOP");
2016 static SIMD_INLINE
void _copy(Vec<T, SIMD_WIDTH> v[ROW_STOP],
2017 Vec<T, SIMD_WIDTH> v2[ROW_STOP])
2020 CopyMatrix<T, SIMD_WIDTH, ROW + 1, ROW_STOP>::_copy(v, v2);
2025template <
typename T,
size_t SIMD_WIDTH,
size_t ROW_STOP>
2026struct CopyMatrix<T, SIMD_WIDTH, ROW_STOP, ROW_STOP>
2028 static SIMD_INLINE
void _copy(Vec<T, SIMD_WIDTH>[ROW_STOP],
2029 Vec<T, SIMD_WIDTH>[ROW_STOP])
// TransposePostprocess16, general case: compile-time recursion that halves
// TRANSPOSE_WIDTH. The job is split into two sub-jobs with
// TRANSPOSE_WIDTH / 2:
//  - first half:  same SRC_OFF, destination offset 2 * DST_OFF
//  - second half: SRC_OFF advanced by SIMD_WIDTH / TRANSPOSE_WIDTH,
//                 destination offset 2 * DST_OFF + ROW_STOP
// The recursion ends in the specialization for TRANSPOSE_WIDTH == 16.
2041template <
typename T,
size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW,
2042 size_t ROW_STOP,
size_t TRANSPOSE_WIDTH,
size_t SRC_OFF,
2044struct TransposePostprocess16
2046 static SIMD_INLINE
void _transpose(
2052 TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
2053 TRANSPOSE_WIDTH / 2, SRC_OFF,
2054 2 * DST_OFF>::_transpose(inRows, outRows);
2055 TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
2056 TRANSPOSE_WIDTH / 2,
2057 SRC_OFF + SIMD_WIDTH / TRANSPOSE_WIDTH,
2058 2 * DST_OFF + ROW_STOP>::_transpose(inRows, outRows);
// TransposePostprocess16 specialization for TRANSPOSE_WIDTH == 16: moves one
// row per step and recurses with ROW + 1 (the recursion over ROW ends in a
// further specialization where ROW == ROW_STOP).
//   STEP    = SIMD_WIDTH / 16
//   SRC_ROW = SRC_OFF + ROW * STEP  row read from inRows
//   DST_ROW = DST_OFF + ROW         row written in outRows
2063template <
typename T,
size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW,
2064 size_t ROW_STOP,
size_t SRC_OFF,
size_t DST_OFF>
2065struct TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP, 16,
2068 static constexpr auto STEP = SIMD_WIDTH / 16;
2069 static constexpr auto SRC_ROW = SRC_OFF + ROW * STEP;
2070 static constexpr auto DST_ROW = DST_OFF + ROW;
2072 static SIMD_INLINE
void _transpose(
// copy the current row to its final position, then continue with the next
2079 outRows[DST_ROW] = inRows[SRC_ROW];
2080 TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW + 1, ROW_STOP, 16,
2081 SRC_OFF, DST_OFF>::_transpose(inRows, outRows);
2086template <
typename T,
size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW_STOP,
2087 size_t SRC_OFF,
size_t DST_OFF>
2088struct TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, ROW_STOP, ROW_STOP, 16,
2091 static SIMD_INLINE
void _transpose(
2100template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2102struct TransposePostprocess
2104 static SIMD_INLINE
void _transpose(
// TransposePostprocess specialization for the Unpack16 policy: reorders the
// rows with TransposePostprocess16, started at ROW 0 with
// TRANSPOSE_WIDTH == SIMD_WIDTH and both offsets 0.
//   NUMROWS  = SIMD_WIDTH / sizeof(T)  (elements per vector)
//   ROW_STOP = 16 / sizeof(T)          (elements per 16 bytes)
2111template <
typename T,
size_t SIMD_WIDTH>
2112struct TransposePostprocess<Unpack16, T, SIMD_WIDTH>
2114 static constexpr auto NUMROWS = SIMD_WIDTH /
sizeof(T);
2115 static constexpr auto ROW_STOP = 16 /
sizeof(T);
2117 static SIMD_INLINE
void _transpose(
2123 TransposePostprocess16<T, SIMD_WIDTH, NUMROWS, 0, ROW_STOP, SIMD_WIDTH, 0,
2124 0>::_transpose(inRows, outRows);
// Transpose variant b, in two phases:
//  1. Transpose with the Unpack16 policy from inRows into the temporary
//     tempRows (all SIMD_WIDTH / sizeof(T) rows),
//  2. TransposePostprocess<Unpack16> from tempRows into outRows.
2135template <
typename T,
size_t SIMD_WIDTH>
2136static SIMD_INLINE
void transpose_b(
2141 Transpose<Unpack16, T, SIMD_WIDTH, SIMD_WIDTH /
sizeof(T),
2142 SIMD_WIDTH /
sizeof(T), 0>::_transpose(inRows, tempRows);
2143 TransposePostprocess<Unpack16, T, SIMD_WIDTH>::_transpose(tempRows, outRows);
// Transpose variant c: like transpose_b it first runs Transpose with the
// Unpack16 policy into a temporary (tempOutRows), but the final reordering is
// done through memory: the temporary rows are written with storeu16 to
// outArray and then N = SIMD_WIDTH / sizeof(T) rows are read back with loadu
// into outRows.
// NOTE(review): the declaration of outArray is not visible in this excerpt.
2153template <
typename T,
size_t SIMD_WIDTH>
2154static SIMD_INLINE
void transpose_c(
2160 Transpose<Unpack16, T, SIMD_WIDTH, SIMD_WIDTH /
sizeof(T),
2161 SIMD_WIDTH /
sizeof(T), 0>::_transpose(inRows, tempOutRows);
2164 const auto N = SIMD_WIDTH /
sizeof(T);
2166 storeu16(outArray, tempOutRows);
2168 loadu(outArray, outRows, N);
// Transpose16, general case: compile-time recursion that halves
// TRANSPOSE_WIDTH. The job is split into two sub-jobs with
// TRANSPOSE_WIDTH / 2:
//  - first half:  same SRC_OFF, destination offset 2 * DST_OFF
//  - second half: SRC_OFF advanced by SIMD_WIDTH / TRANSPOSE_WIDTH,
//                 destination offset 2 * DST_OFF + ROW_STOP
// The recursion ends in the specialization for TRANSPOSE_WIDTH == 16.
2180template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2184 size_t NUMROWS,
size_t ROW,
size_t ROW_STOP,
size_t TRANSPOSE_WIDTH,
2185 size_t SRC_OFF,
size_t DST_OFF>
2188 static SIMD_INLINE
void _transpose(
2192 Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
2193 TRANSPOSE_WIDTH / 2, SRC_OFF, 2 * DST_OFF>::_transpose(inRows,
2195 Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP,
2196 TRANSPOSE_WIDTH / 2, SRC_OFF + SIMD_WIDTH / TRANSPOSE_WIDTH,
2197 2 * DST_OFF + ROW_STOP>::_transpose(inRows, outRows);
// Transpose16 specialization for TRANSPOSE_WIDTH == 16: computes one output
// row per step and recurses with ROW + 1 (the recursion over ROW ends in a
// further specialization where ROW == ROW_STOP).
//   STEP    = SIMD_WIDTH / 16
//   SRC_ROW = SRC_OFF + ROW * STEP  passed to Transpose1 as NLOHI
//   DST_ROW = DST_OFF + ROW         row written in outRows
// The Transpose1 result is written directly to its final position in outRows.
2202template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2203 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW,
size_t ROW_STOP,
2204 size_t SRC_OFF,
size_t DST_OFF>
2205struct Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW, ROW_STOP, 16, SRC_OFF,
2208 static constexpr auto STEP = SIMD_WIDTH / 16;
2209 static constexpr auto SRC_ROW = SRC_OFF + ROW * STEP;
2210 static constexpr auto DST_ROW = DST_OFF + ROW;
2212 static SIMD_INLINE
void _transpose(
2219 outRows[DST_ROW] = Transpose1<Unpack, T, SIMD_WIDTH,
2221 0, SRC_ROW, NUMROWS / 2>::_transpose1(inRows);
2224 Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW + 1, ROW_STOP, 16, SRC_OFF,
2225 DST_OFF>::_transpose(inRows, outRows);
2230template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2231 size_t SIMD_WIDTH,
size_t NUMROWS,
size_t ROW_STOP,
size_t SRC_OFF,
2233struct Transpose16<Unpack, T, SIMD_WIDTH, NUMROWS, ROW_STOP, ROW_STOP, 16,
2236 static SIMD_INLINE
void _transpose(
// Transpose variant d: a single Transpose16 pass with the Unpack16 policy.
// Started with NUMROWS == SIMD_WIDTH / sizeof(T), ROW 0,
// ROW_STOP == 16 / sizeof(T), TRANSPOSE_WIDTH == SIMD_WIDTH and both offsets
// 0; the rows are written straight into outRows.
2244template <
typename T,
size_t SIMD_WIDTH>
2245static SIMD_INLINE
void transpose_d(
2249 Transpose16<Unpack16, T, SIMD_WIDTH,
2251 SIMD_WIDTH /
sizeof(T), 0, 16 /
sizeof(T),
2253 SIMD_WIDTH, 0, 0>::_transpose(inRows, outRows);
// swizzle2_a: in-place swizzle of the 2 * N vectors in v using repeated
// zip<1> passes. One pass is executed for blkSize = 1, 2, 4, ... up to and
// including FINALBLKSIZE. In each pass, v[src] and v[src + N] are zipped into
// the neighbouring slots v2[dst] and v2[dst + 1] of a temporary, and the
// temporary is then copied back to v.
2272template <
size_t N,
size_t FINALBLKSIZE,
typename T,
size_t SIMD_WIDTH>
2273static SIMD_INLINE
void swizzle2_a(Vec<T, SIMD_WIDTH> v[2 * N])
2275 Vec<T, SIMD_WIDTH> v2[2 * N];
2276 for (
size_t blkSize = 1; blkSize <= FINALBLKSIZE; blkSize *= 2) {
// one zip pass: v -> v2
2278 for (
size_t src = 0, dst = 0; src < N; src++, dst += 2)
2279 zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);
// copy the pass result back so the next pass reads from v again
2282 for (
size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
// swizzle2_a overload without an explicit block size: uses
// Vec<T, SIMD_WIDTH>::elements as FINALBLKSIZE.
2286template <
size_t N,
typename T,
size_t SIMD_WIDTH>
2287static SIMD_INLINE
void swizzle2_a(Vec<T, SIMD_WIDTH> v[2 * N])
2289 swizzle2_a<N, Vec<T, SIMD_WIDTH>::elements>(v);
// Transpose variant e: copies the input rows to outRows and then swizzles
// outRows in place with swizzle2_a<num / 2, num / 2>.
// NOTE(review): the definition of `num` is not visible in this excerpt;
// presumably it is the number of elements per vector - confirm.
2298template <
typename T,
size_t SIMD_WIDTH>
2299static SIMD_INLINE
void transpose_e(
2304 for (
size_t i = 0; i < num; i++) outRows[i] = inRows[i];
2305 swizzle2_a<num / 2, num / 2>(outRows);
// swizzle2_b: in-place swizzle of the 2 * N vectors in v using repeated
// zip<1> passes, arranged to avoid most copies back from the temporary.
//   origReps  = floorlog2(FINALBLKSIZE) + 1  total number of zip passes
//   finalReps = origReps / 2                 number of double passes
// A double pass zips v -> v2 and then v2 -> v, so the data ends up in v again
// without a copy. If origReps is odd, one more pass v -> v2 follows and v2 is
// copied back to v.
2320template <
size_t N,
size_t FINALBLKSIZE,
typename T,
size_t SIMD_WIDTH>
2321static SIMD_INLINE
void swizzle2_b(Vec<T, SIMD_WIDTH> v[2 * N])
2323 Vec<T, SIMD_WIDTH> v2[2 * N];
2324 const auto origReps = floorlog2(FINALBLKSIZE) + 1;
2325 const auto finalReps = origReps / 2;
2328 for (
size_t rep = 0; rep < finalReps; rep++) {
// first half of the double pass: v -> v2
2330 for (
size_t src = 0, dst = 0; src < N; src++, dst += 2)
2331 zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);
// second half of the double pass: v2 -> v
2334 for (
size_t src = 0, dst = 0; src < N; src++, dst += 2)
2335 zip<1>(v2[src], v2[src + N], v[dst], v[dst + 1]);
// even number of passes: result is already in v
2339 if (origReps % 2 == 0)
return;
// odd number of passes: one extra pass v -> v2 ...
2342 for (
size_t src = 0, dst = 0; src < N; src++, dst += 2)
2343 zip<1>(v[src], v[src + N], v2[dst], v2[dst + 1]);
// ... followed by the copy back to v
2346 for (
size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
// swizzle2_b overload without an explicit block size: uses
// Vec<T, SIMD_WIDTH>::elements as FINALBLKSIZE.
2349template <
size_t N,
typename T,
size_t SIMD_WIDTH>
2350static SIMD_INLINE
void swizzle2_b(Vec<T, SIMD_WIDTH> v[2 * N])
2352 swizzle2_b<N, Vec<T, SIMD_WIDTH>::elements>(v);
// Transpose variant f: copies the input rows to outRows and then swizzles
// outRows in place with swizzle2_b<elems / 2, elems / 2>.
// NOTE(review): the definition of `elems` is not visible in this excerpt;
// presumably it is the number of elements per vector - confirm.
2361template <
typename T,
size_t SIMD_WIDTH>
2362static SIMD_INLINE
void transpose_f(
2367 for (
size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2368 swizzle2_b<elems / 2, elems / 2>(outRows);
2379template <
size_t NUM_ELEMS,
typename T,
size_t SIMD_WIDTH>
2382 static SIMD_INLINE
void _zip(Vec<T, SIMD_WIDTH> a, Vec<T, SIMD_WIDTH> b,
2383 Vec<T, SIMD_WIDTH> &l, Vec<T, SIMD_WIDTH> &h)
2389template <
size_t NUM_ELEMS,
typename T,
size_t SIMD_WIDTH>
2392 static SIMD_INLINE
void _zip(Vec<T, SIMD_WIDTH> a, Vec<T, SIMD_WIDTH> b,
2393 Vec<T, SIMD_WIDTH> &l, Vec<T, SIMD_WIDTH> &h)
// Swizzle2Once, general case: one zip pass over 2 * N vectors, unrolled at
// compile time. Each step zips v[SRC] and v[SRC + N] into v2[DST] and
// v2[DST + 1] through the Zip policy, then recurses with SRC + 1 and DST + 2.
// The recursion ends in the specialization where SRC == N.
2402template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2403 size_t SIMD_WIDTH,
size_t N,
size_t SRC,
size_t DST>
2406 static constexpr auto SRC2 = SRC + N;
2407 static constexpr auto DST2 = DST + 1;
2409 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2410 Vec<T, SIMD_WIDTH> v2[2 * N])
2414 Zip<1, T, SIMD_WIDTH>::_zip(v[SRC], v[SRC2], v2[DST], v2[DST2]);
2415 Swizzle2Once<Zip, T, SIMD_WIDTH, N, SRC + 1, DST + 2>::_swizzle(v, v2);
2420template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2421 size_t SIMD_WIDTH,
size_t N,
size_t DST>
2422struct Swizzle2Once<Zip, T, SIMD_WIDTH, N, N, DST>
2424 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH>[2 * N] ,
2425 Vec<T, SIMD_WIDTH>[2 * N] )
// Swizzle2Multiple, general case: one double pass per REP. A double pass runs
// Swizzle2Once from v to v2 and again from v2 back to v, so the data is in v
// afterwards without a copy. The recursion continues with REP + 1 and ends in
// the specializations where REP == FINAL_REPS; ODD is passed through
// unchanged.
2437template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2438 size_t SIMD_WIDTH,
size_t N,
size_t REP,
size_t FINAL_REPS,
2440struct Swizzle2Multiple
2442 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2443 Vec<T, SIMD_WIDTH> v2[2 * N])
2447 Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v, v2);
2448 Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v2, v);
2449 Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, REP + 1, FINAL_REPS, ODD>::_swizzle(
2455template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2456 size_t SIMD_WIDTH,
size_t N,
size_t FINAL_REPS,
size_t ODD>
2457struct Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, ODD>
2459 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH>[2 * N],
2460 Vec<T, SIMD_WIDTH>[2 * N])
// Swizzle2Multiple specialization for REP == FINAL_REPS with ODD == 1: the
// total number of passes is odd, so one extra Swizzle2Once pass from v to v2
// is run and the 2 * N rows of v2 are copied back to v with CopyMatrix.
2465template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2466 size_t SIMD_WIDTH,
size_t N,
size_t FINAL_REPS>
2467struct Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, 1>
2469 static constexpr auto ROW_STOP = 2 * N;
2471 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2472 Vec<T, SIMD_WIDTH> v2[2 * N])
2475 Swizzle2Once<Zip, T, SIMD_WIDTH, N, 0, 0>::_swizzle(v, v2);
2476 CopyMatrix<T, SIMD_WIDTH, 0, ROW_STOP>::_copy(v2, v);
// Swizzle2: compile-time unrolled in-place swizzle of the 2 * N vectors in v.
//   ORIG_REPS  = floorlog2(FINALBLKSIZE) + 1  total number of zip passes
//   FINAL_REPS = ORIG_REPS / 2                number of double passes
//   ODD        = lowest bit of ORIG_REPS      1 if a single extra pass is needed
// The work is delegated to Swizzle2Multiple, started at REP 0, with the local
// array v2 as the temporary.
2486template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2487 size_t SIMD_WIDTH,
size_t N,
size_t FINALBLKSIZE>
2490 static constexpr auto ORIG_REPS = floorlog2(FINALBLKSIZE) + 1;
2491 static constexpr auto FINAL_REPS = ORIG_REPS / 2;
2492 static constexpr auto ODD = (ORIG_REPS & 0x01);
2494 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N])
2496 Vec<T, SIMD_WIDTH> v2[2 * N];
2501 Swizzle2Multiple<Zip, T, SIMD_WIDTH, N, 0, FINAL_REPS, ODD>::_swizzle(v,
2512template <
size_t N,
typename T,
size_t SIMD_WIDTH>
2513static SIMD_INLINE
void swizzle2_c(Vec<T, SIMD_WIDTH> v[2 * N])
2515 Swizzle2<Zip, T, SIMD_WIDTH, N, Vec<T, SIMD_WIDTH>::elements>::_swizzle(v);
2531template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t SRC,
size_t DST>
2534 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2535 Vec<T, SIMD_WIDTH> v2[2 * N])
2537 unzip<1, T>(v[SRC], v[SRC + 1], v2[DST], v2[DST + N]);
2538 UnswizzleOnce<T, SIMD_WIDTH, N, SRC + 2, DST + 1>::_unswizzle(v, v2);
2543template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t SRC>
2544struct UnswizzleOnce<T, SIMD_WIDTH, N, SRC, N>
2546 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH>[2 * N],
2547 Vec<T, SIMD_WIDTH>[2 * N])
2551template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t REP,
2552 size_t FINAL_REPS,
size_t ODD>
2553struct UnswizzleMultiple
2555 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2556 Vec<T, SIMD_WIDTH> v2[2 * N])
2558 UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v, v2);
2559 UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v2, v);
2560 UnswizzleMultiple<T, SIMD_WIDTH, N, REP + 1, FINAL_REPS, ODD>::_unswizzle(
2566template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t FINAL_REPS,
2568struct UnswizzleMultiple<T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, ODD>
2570 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH>[2 * N],
2571 Vec<T, SIMD_WIDTH>[2 * N])
2576template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t FINAL_REPS>
2577struct UnswizzleMultiple<T, SIMD_WIDTH, N, FINAL_REPS, FINAL_REPS, 1>
2579 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2580 Vec<T, SIMD_WIDTH> v2[2 * N])
2582 UnswizzleOnce<T, SIMD_WIDTH, N, 0, 0>::_unswizzle(v, v2);
2583 CopyMatrix<T, SIMD_WIDTH, 0, 2 * N>::_copy(v2, v);
2589template <
typename T,
size_t SIMD_WIDTH,
size_t N>
2593 static constexpr auto ORIG_REPS = floorlog2(FINALBLKSIZE) + 1;
2594 static constexpr auto FINAL_REPS = ORIG_REPS / 2;
2595 static constexpr auto ODD = (ORIG_REPS & 0x01);
2597 static SIMD_INLINE
void _unswizzle(Vec<T, SIMD_WIDTH> v[2 * N])
2599 Vec<T, SIMD_WIDTH> v2[2 * N];
2600 UnswizzleMultiple<T, SIMD_WIDTH, N, 0, FINAL_REPS, ODD>::_unswizzle(v, v2);
2610template <
size_t N,
typename T,
size_t SIMD_WIDTH>
2611static SIMD_INLINE
void unswizzle_b(Vec<T, SIMD_WIDTH> v[2 * N])
2613 Unswizzle<T, SIMD_WIDTH, N>::_unswizzle(v);
2622template <
typename T,
size_t SIMD_WIDTH>
2623static SIMD_INLINE
void transpose_g(
2628 for (
size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2629 Swizzle2<Zip, T, SIMD_WIDTH, elems / 2, elems / 2>::_swizzle(outRows);
2641template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t SRC,
size_t DST,
2643struct Swizzle2Postprocess16Once
2645 static constexpr auto SRC2 = SRC + 1;
2646 static constexpr auto DST2 = DST + N;
2648 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2649 Vec<T, SIMD_WIDTH> v2[2 * N])
2653 Zip16<LANE_ELEMS, T, SIMD_WIDTH>::_zip(v[SRC], v[SRC2], v2[DST], v2[DST2]);
2654 Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, SRC + 2, DST + 1,
2655 LANE_ELEMS>::_swizzle(v, v2);
2660template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t SRC,
2662struct Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, SRC, N, LANE_ELEMS>
2664 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH>[2 * N] ,
2665 Vec<T, SIMD_WIDTH>[2 * N] )
2677template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t LANE_ELEMS,
2678 size_t REP,
size_t FINAL_REPS,
size_t ODD>
2679struct Swizzle2Postprocess16
2681 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2682 Vec<T, SIMD_WIDTH> v2[2 * N])
2687 Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS>::_swizzle(v,
2689 Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS * 2>::_swizzle(
2691 Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS * 4, REP + 1, FINAL_REPS,
2692 ODD>::_swizzle(v, v2);
2697template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t LANE_ELEMS,
2698 size_t FINAL_REPS,
size_t ODD>
2699struct Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, FINAL_REPS,
2702 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH>[2 * N],
2703 Vec<T, SIMD_WIDTH>[2 * N])
2708template <
typename T,
size_t SIMD_WIDTH,
size_t N,
size_t LANE_ELEMS,
2710struct Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, FINAL_REPS,
2713 static constexpr auto ROW_STOP = 2 * N;
2715 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N],
2716 Vec<T, SIMD_WIDTH> v2[2 * N])
2718 Swizzle2Postprocess16Once<T, SIMD_WIDTH, N, 0, 0, LANE_ELEMS>::_swizzle(v,
2720 CopyMatrix<T, SIMD_WIDTH, 0, ROW_STOP>::_copy(v2, v);
2727template <
template <
size_t,
typename,
size_t>
class Zip,
typename T,
2728 size_t SIMD_WIDTH,
size_t N>
2729struct Swizzle2Postprocess
2731 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH>[2 * N]) {}
2735template <
typename T,
size_t SIMD_WIDTH,
size_t N>
2736struct Swizzle2Postprocess<Zip16, T, SIMD_WIDTH, N>
2738 static constexpr auto ORIG_REPS = floorlog2(SIMD_WIDTH) - 4;
2739 static constexpr auto FINAL_REPS = ORIG_REPS / 2;
2740 static constexpr auto ODD = (ORIG_REPS & 0x01);
2741 static constexpr auto LANE_ELEMS = 16 /
sizeof(T);
2743 static SIMD_INLINE
void _swizzle(Vec<T, SIMD_WIDTH> v[2 * N])
2745 Vec<T, SIMD_WIDTH> v2[2 * N];
2746 Swizzle2Postprocess16<T, SIMD_WIDTH, N, LANE_ELEMS, 0, FINAL_REPS,
2747 ODD>::_swizzle(v, v2);
2755template <
typename T,
size_t SIMD_WIDTH>
2756static SIMD_INLINE
void transpose_h(
2761 for (
size_t i = 0; i < elems; i++) outRows[i] = inRows[i];
2762 Swizzle2<Zip16, T, SIMD_WIDTH, elems / 2, elems / 2>::_swizzle(outRows);
2763 Swizzle2Postprocess<Zip16, T, SIMD_WIDTH, elems / 2>::_swizzle(outRows);
2773template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2774 size_t SIMD_WIDTH,
size_t PROCESS_ROW,
size_t PROCESS_ROWS,
2775 size_t UNPACK_ELEMS,
size_t UNPACK_REP,
size_t UNPACK_REPS,
2776 size_t SUB_BASE,
size_t SUB>
2777struct TransposeRcUnpackSingle
2779 static constexpr auto UNPACK_PART =
2780 (PROCESS_ROW >> (UNPACK_REPS - UNPACK_REP - 1)) & 0x01;
2781 static constexpr auto UNPACK_PART_NEXT =
2782 ((PROCESS_ROW + 1) >> (UNPACK_REPS - UNPACK_REP - 1)) & 0x01;
2783 static constexpr auto SRC1 = (PROCESS_ROW - SUB) * 2;
2784 static constexpr auto SRC2 = (PROCESS_ROW - SUB) * 2 + 1;
2786 static SIMD_INLINE
void _transpose(
2787 const Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
2788 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2793 outRows[PROCESS_ROW] =
2794 Unpack<UNPACK_PART, UNPACK_ELEMS, T, SIMD_WIDTH>::_unpack(inRows[SRC1],
2796 TransposeRcUnpackSingle<
2797 Unpack, T, SIMD_WIDTH, PROCESS_ROW + 1, PROCESS_ROWS, UNPACK_ELEMS,
2798 UNPACK_REP, UNPACK_REPS, SUB_BASE,
2799 SUB + (UNPACK_PART_NEXT == 1 && (PROCESS_ROW + 1) % SUB_BASE == 0 ?
2801 0)>::_transpose(inRows, outRows);
2806template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2807 size_t SIMD_WIDTH,
size_t PROCESS_ROWS,
size_t UNPACK_ELEMS,
2808 size_t UNPACK_REP,
size_t UNPACK_REPS,
size_t SUB_BASE,
size_t SUB>
2809struct TransposeRcUnpackSingle<Unpack, T, SIMD_WIDTH, PROCESS_ROWS,
2810 PROCESS_ROWS, UNPACK_ELEMS, UNPACK_REP,
2811 UNPACK_REPS, SUB_BASE, SUB>
2813 static SIMD_INLINE
void _transpose(
2814 const Vec<T, SIMD_WIDTH>[PROCESS_ROWS] ,
2815 Vec<T, SIMD_WIDTH>[PROCESS_ROWS] )
2829template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2830 size_t SIMD_WIDTH,
size_t UNPACK_REP,
size_t UNPACK_REPS,
2831 size_t PROCESS_ROWS,
size_t UNPACK_ELEMS,
size_t SUB_BASE,
2833struct TransposeRcUnpackMultiple
2835 static SIMD_INLINE
void _transpose(Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
2836 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2840 TransposeRcUnpackSingle<Unpack, T, SIMD_WIDTH, 0, PROCESS_ROWS,
2841 UNPACK_ELEMS, UNPACK_REP, UNPACK_REPS, SUB_BASE,
2842 0>::_transpose(inRows, outRows);
2843 TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REP + 1,
2844 UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS * 2,
2845 SUB_BASE / 2, UNPACK_ODD>::_transpose(outRows,
2851template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2852 size_t SIMD_WIDTH,
size_t UNPACK_REPS,
size_t PROCESS_ROWS,
2853 size_t UNPACK_ELEMS,
size_t SUB_BASE,
size_t UNPACK_ODD>
2854struct TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REPS,
2855 UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS,
2856 SUB_BASE, UNPACK_ODD>
2858 static SIMD_INLINE
void _transpose(Vec<T, SIMD_WIDTH>[PROCESS_ROWS],
2859 Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
2864template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2865 size_t SIMD_WIDTH,
size_t UNPACK_REPS,
size_t PROCESS_ROWS,
2866 size_t UNPACK_ELEMS,
size_t SUB_BASE>
2867struct TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, UNPACK_REPS,
2868 UNPACK_REPS, PROCESS_ROWS, UNPACK_ELEMS,
2871 static SIMD_INLINE
void _transpose(Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS],
2872 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2876 CopyMatrix<T, SIMD_WIDTH, 0, PROCESS_ROWS>::_copy(inRows, outRows);
2883template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2884 size_t PROCESS_REPS,
size_t PROCESS_ROWS,
size_t UNPACK_REPS,
2885 size_t STORE_OFF,
size_t VO,
size_t LANE>
2886struct TransposeRcStoreLane
2890 static SIMD_INLINE
void _store(
2892 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2895 TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
2896 UNPACK_REPS, STORE_OFF + PROCESS_ROWS * VEC_ELEMS_OUT,
2897 VO, LANE + 1>::_store(outArray, outRows);
2902template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2903 size_t PROCESS_REPS,
size_t PROCESS_ROWS,
size_t UNPACK_REPS,
2904 size_t STORE_OFF,
size_t VO>
2905struct TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
2906 PROCESS_ROWS, UNPACK_REPS, STORE_OFF, VO,
2909 static SIMD_INLINE
void _store(
2911 Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
2918template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2919 size_t PROCESS_REPS,
size_t PROCESS_ROWS,
size_t UNPACK_REPS,
2920 size_t STORE_OFF,
size_t VO>
2921struct TransposeRcStoreLanes
2925 static SIMD_INLINE
void _store(
2927 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2929 TransposeRcStoreLane<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
2930 UNPACK_REPS, STORE_OFF, VO, 0>::_store(outArray,
2932 TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
2933 PROCESS_ROWS, UNPACK_REPS, STORE_OFF + VEC_ELEMS_OUT,
2934 VO + 1>::_store(outArray, outRows);
2939template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2940 size_t PROCESS_REPS,
size_t PROCESS_ROWS,
size_t UNPACK_REPS,
2942struct TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
2943 PROCESS_ROWS, UNPACK_REPS, STORE_OFF, PROCESS_ROWS>
2945 static SIMD_INLINE
void _store(
2947 Vec<T, SIMD_WIDTH>[PROCESS_ROWS])
2955template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2956 size_t PROCESS_REPS,
size_t PROCESS_ROWS,
size_t UNPACK_REPS>
2957struct TransposeRcStore
2959 static constexpr auto ELEMS_PER_LANE = 16 /
sizeof(T);
2960 static constexpr auto STORE_OFF = PROCESS_REP * ELEMS_PER_LANE;
2962 static SIMD_INLINE
void _store(
2964 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2966 TransposeRcStoreLanes<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS,
2967 PROCESS_ROWS, UNPACK_REPS, STORE_OFF,
2968 0>::_store(outArray, outRows);
2972template <
typename T,
size_t SIMD_WIDTH,
size_t PROCESS_REP,
2973 size_t PROCESS_ROWS,
size_t UNPACK_REPS>
2974struct TransposeRcStore<T, SIMD_WIDTH, PROCESS_REP, 1, PROCESS_ROWS,
2977 static SIMD_INLINE
void _store(
2979 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS])
2982 Store16<Store, T, SIMD_WIDTH, PROCESS_ROWS, 0, 16 /
sizeof(T), SIMD_WIDTH,
2983 0, 0>::_store16(outArray, outRows);
2991template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
2992 size_t SIMD_WIDTH,
size_t PROCESS_REP,
size_t PROCESS_REPS,
2993 size_t PROCESS_ROWS,
size_t UNPACK_REPS,
size_t UNPACK_ODD>
2994struct TransposeRcRep
2996 static constexpr auto LOAD_OFF =
2997 PROCESS_REP * PROCESS_ROWS * SIMD_WIDTH /
sizeof(T);
2998 static constexpr auto SUB_BASE = 1 << (floorlog2(PROCESS_ROWS) - 1);
3000 static SIMD_INLINE
void _transpose(
3007 Vec<T, SIMD_WIDTH> inRows[PROCESS_ROWS];
3008 Vec<T, SIMD_WIDTH> outRows[PROCESS_ROWS];
3009 load(inArray + LOAD_OFF, inRows, PROCESS_ROWS);
3015 TransposeRcUnpackMultiple<Unpack, T, SIMD_WIDTH, 0, UNPACK_REPS,
3016 PROCESS_ROWS, 1, SUB_BASE,
3017 UNPACK_ODD>::_transpose(inRows, outRows);
3023 TransposeRcStore<T, SIMD_WIDTH, PROCESS_REP, PROCESS_REPS, PROCESS_ROWS,
3024 UNPACK_REPS>::_store(outArray, outRows);
3025 TransposeRcRep<Unpack, T, SIMD_WIDTH, PROCESS_REP + 1, PROCESS_REPS,
3026 PROCESS_ROWS, UNPACK_REPS, UNPACK_ODD>::_transpose(inArray,
3032template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
3033 size_t SIMD_WIDTH,
size_t PROCESS_REPS,
size_t PROCESS_ROWS,
3034 size_t UNPACK_REPS,
size_t UNPACK_ODD>
3035struct TransposeRcRep<Unpack, T, SIMD_WIDTH, PROCESS_REPS, PROCESS_REPS,
3036 PROCESS_ROWS, UNPACK_REPS, UNPACK_ODD>
3038 static SIMD_INLINE
void _transpose(
3047template <
template <
size_t,
size_t,
typename,
size_t>
class Unpack,
typename T,
3051 static constexpr auto SIMD_REGS = NATIVE_SIMD_REG_COUNT / 2;
3052 static constexpr auto NUM_ROWS = SIMD_WIDTH /
sizeof(T);
3053 static constexpr auto PROCESS_REPS =
3054 NUM_ROWS > SIMD_REGS ? SIMD_WIDTH / 16 : 1;
3055 static constexpr auto PROCESS_ROWS = NUM_ROWS / PROCESS_REPS;
3056 static constexpr auto UNPACK_REPS =
3057 PROCESS_REPS == 1 ? floorlog2(PROCESS_ROWS) : floorlog2(16 / sizeof(T));
3058 static constexpr auto UNPACK_ODD = (UNPACK_REPS & 0x01);
3060 static SIMD_INLINE
void _transpose(
3065 TransposeRcRep<Unpack, T, SIMD_WIDTH, 0, PROCESS_REPS, PROCESS_ROWS,
3066 UNPACK_REPS, UNPACK_ODD>::_transpose(inArray, outArray);
3076template <
typename T,
size_t SIMD_WIDTH>
3077static SIMD_INLINE
void transpose_i(
3088 T inArray[N * N] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
3089 T outArray[N * N] SIMD_ATTR_ALIGNED(SIMD_WIDTH);
3090 store(inArray, inRows, N);
3091 TransposeRc<Unpack16, T, SIMD_WIDTH>::_transpose(inArray, outArray);
3092 load(outArray, outRows, N);
3099template <
size_t N,
typename T,
size_t SIMD_WIDTH>
3100static SIMD_INLINE
void unswizzle_a(Vec<T, SIMD_WIDTH> v[2 * N])
3103 Vec<T, SIMD_WIDTH> v2[2 * N];
3104 for (
size_t blkSize = 1; blkSize <= finalBlkSize; blkSize *= 2) {
3106 for (
size_t dst = 0, src = 0; dst < N; dst++, src += 2)
3107 unzip<1>(v[src], v[src + 1], v2[dst], v2[dst + N]);
3110 for (
size_t i = 0; i < 2 * N; i++) v[i] = v2[i];
3159template <
size_t N,
typename T,
size_t SIMD_WIDTH>
3165 internal::ext::swizzle2_c<N>(v);
3202template <
size_t N,
typename T,
size_t SIMD_WIDTH>
3207 internal::ext::unswizzle_b<N>(v);
3222template <
typename T,
size_t SIMD_WIDTH>
3244 internal::ext::transpose1inplcLane(inRows, outRows);
3259template <
typename T,
size_t SIMD_WIDTH>
3267 internal::ext::transpose1inplcLane(rows);
3292template <
typename T,
size_t SIMD_WIDTH,
int SHIFT,
int END_SHIFT>
3299 return HInt<T, SIMD_WIDTH, 2 * SHIFT, END_SHIFT>::integrate(
3305template <
typename T,
size_t SIMD_WIDTH,
int END_SHIFT>
3306struct HInt<T, SIMD_WIDTH, END_SHIFT, END_SHIFT>
3329template <
typename T,
size_t SIMD_WIDTH>
3333 return internal::ext::HInt<T, SIMD_WIDTH, 1,
3347template <
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3351 return cmpeq(zero, zero);
3367template <
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3380template <
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3392template <
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3406template <
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3409 static_assert(std::is_signed<T>::value || std::is_floating_point<T>::value,
3410 "setnegunity() only available for signed integer and floating "
3429template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3433template <
typename T,
size_t SIMD_WIDTH>
3434struct Cas<
SortSlope::DESCENDING, T, SIMD_WIDTH>
3436 static void compareAndSwap(Vec<T, SIMD_WIDTH> &a, Vec<T, SIMD_WIDTH> &b)
3438 Vec<T, SIMD_WIDTH> temp =
min(a, b);
3445template <
typename T,
size_t SIMD_WIDTH>
3446struct Cas<
SortSlope::ASCENDING, T, SIMD_WIDTH>
3448 static void compareAndSwap(Vec<T, SIMD_WIDTH> &a, Vec<T, SIMD_WIDTH> &b)
3450 Vec<T, SIMD_WIDTH> temp =
max(a, b);
3459template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3460static SIMD_INLINE
void bitonicSortTransposed(
3465 for (
size_t blkSize = 2; blkSize <= numVecs; blkSize *= 2) {
3470 for (
size_t blkStart = 0; blkStart < numVecs; blkStart += blkSize) {
3471 size_t halfBlk = blkSize / 2;
3472 size_t leftCounter = blkStart;
3473 size_t rightCounter = blkStart + (blkSize - 1);
3475 for (
size_t i = 0; i < halfBlk; i++) {
3476 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3477 transVecs[rightCounter]);
3487 for (
size_t step = blkSize / 4; step > 0; step /= 2) {
3489 for (
size_t jump = 0; jump < blkSize; jump += step * 2) {
3490 leftCounter = blkStart + jump;
3491 rightCounter = blkStart + jump + step;
3493 for (
size_t k = 0; k < step; k++) {
3494 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3495 transVecs[rightCounter]);
3511template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3512static SIMD_INLINE
void bitonicSortReducedTransposed(
3516 for (
size_t step = numVecs / 2; step > 0; step /= 2) {
3518 for (
size_t jump = 0; jump < numVecs; jump += step * 2) {
3519 size_t leftCounter = jump;
3520 size_t rightCounter = jump + step;
3522 for (
size_t k = 0; k < step; k++) {
3523 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(transVecs[leftCounter],
3524 transVecs[rightCounter]);
3533template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3534static SIMD_INLINE
void bitonicSortReduced(
3539 internal::ext::bitonicSortReducedTransposed<SLOPE>(transVecs);
3553template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3559 internal::ext::bitonicSortTransposed<SLOPE>(transVecs);
3568template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3569static SIMD_INLINE
void bitonicFusion(Vec<T, SIMD_WIDTH> &a,
3570 Vec<T, SIMD_WIDTH> &b)
3573 Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(a, b);
3589template <SortSlope SLOPE,
typename T,
size_t SIMD_WIDTH_DEFAULT_NATIVE>
3595 for (
size_t i = 0; i < Vec<T, SIMD_WIDTH>::elements; i += 2)
3596 internal::ext::bitonicFusion<SLOPE>(vecs[i], vecs[i + 1]);
3600 internal::ext::bitonicSortReduced<SLOPE>(vecs);
3617template <
size_t LENGTH,
SortSlope SLOPE,
typename T,
3618 size_t SIMD_WIDTH_DEFAULT_NATIVE>
3626 constexpr size_t NUM_VECS = LENGTH / SIMD_ELEMS;
3629 constexpr size_t SORTING_STAGES = NUM_VECS / SIMD_ELEMS;
3639 static_assert(SORTING_STAGES * SIMD_ELEMS * SIMD_ELEMS == LENGTH,
3640 "LENGTH is not 2^n * SIMD_ELEMS^2");
3647 for (
size_t i = 0; i < SORTING_STAGES; i++) {
3654 for (
size_t bulk_size = 2; bulk_size <= NUM_VECS; bulk_size *= 2) {
3657 for (
size_t bulk_start = 0; bulk_start < NUM_VECS;
3658 bulk_start += bulk_size) {
3659 size_t half_bulk = bulk_size / 2;
3660 size_t left_counter = bulk_start;
3661 size_t right_counter = bulk_start + (bulk_size - 1);
3662 for (
size_t i = 0; i < half_bulk; i++) {
3663 internal::ext::bitonicFusion<SLOPE>(vecs[left_counter],
3664 vecs[right_counter]);
3670 for (
size_t bulk_start = 0; bulk_start < NUM_VECS;
3671 bulk_start += bulk_size) {
3672 for (
size_t step = bulk_size / 4; step > 0; step /= 2) {
3673 for (
size_t jump = 0; jump < bulk_size; jump += step * 2) {
3674 size_t left_counter = bulk_start + jump;
3675 size_t right_counter = bulk_start + jump + step;
3676 for (
size_t k = 0; k < step; k++) {
3677 internal::ext::Cas<SLOPE, T, SIMD_WIDTH>::compareAndSwap(
3678 vecs[left_counter], vecs[right_counter]);
3686 for (
size_t i = 0; i < SORTING_STAGES; i++) {
3695 internal::ext::bitonicSortReduced<SLOPE>(current_vecs);
3710#define SIMDVEC_BINOPEQ(OP, FCT) \
3711 template <typename T, size_t SIMD_WIDTH> \
3712 static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(Vec<T, SIMD_WIDTH> &a, \
3713 const Vec<T, SIMD_WIDTH> &b) \
3719#define SIMDVEC_BINOP(OP, FCT) \
3720 template <typename T, size_t SIMD_WIDTH> \
3721 static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(const Vec<T, SIMD_WIDTH> &a, \
3722 const Vec<T, SIMD_WIDTH> &b) \
3727#define SIMDVEC_UNOP(OP, FCT) \
3728 template <typename T, size_t SIMD_WIDTH> \
3729 static SIMD_INLINE Vec<T, SIMD_WIDTH> OP(const Vec<T, SIMD_WIDTH> &a) \
3763SIMDVEC_BINOPEQ(operator*=,
mul)
3765SIMDVEC_BINOPEQ(operator/=,
div)
3782SIMDVEC_BINOP(operator<=,
cmple)
3784SIMDVEC_BINOP(operator<,
cmplt)
Iterative horizontal accumulator with store of the result. Calculates the horizontal accumulation of ...
Definition ext.H:1395
void push(Vec< T, SIMD_WIDTH > v)
Pushes the next Vec to be horizontally accumulated. Stores the result of the horizontal accumulation ...
Definition ext.H:1419
void finish()
Finishes the horizontal accumulation and stores the result of the horizontal accumulation into memory...
Definition ext.H:1434
HAccStore(T *const p)
Constructs a new HAccStore object.
Definition ext.H:1408
SIMD vector class, holds multiple elements of the same type.
Definition vec.H:75
static constexpr size_t elems
Number of elements in the vector. Alias for elements.
Definition vec.H:85
static constexpr size_t elements
Number of elements in the vector.
Definition vec.H:80
static Vec< T, SIMD_WIDTH > sign(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Negates the elements of a Vec of floating-point numbers where the corresponding element of a second V...
Definition ext.H:1800
static Vec< T, SIMD_WIDTH > avgrd(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vecs, rounding down.
Definition ext.H:1611
static Vec< T, SIMD_WIDTH > sub(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Subtracts the elements of two Vec's.
Definition base.H:388
static Vec< T, SIMD_WIDTH > subs(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Subtracts the elements of two Vec's using saturated arithmetic.
Definition base.H:405
static Vec< T, SIMD_WIDTH > avg(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vec's, rounded up.
Definition base.H:456
static Vec< T, SIMD_WIDTH > div2rd(const Vec< T, SIMD_WIDTH > &a)
Divides all elements of a Vec by 2 and rounds down the result.
Definition ext.H:1776
static Vec< T, SIMD_WIDTH > adds(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Adds the elements of two Vec's using saturated arithmetic.
Definition base.H:374
static Vec< T, SIMD_WIDTH > div2r0(const Vec< T, SIMD_WIDTH > &a)
Divides all elements of a Vec by 2 and rounds the result to 0.
Definition ext.H:1696
static Vec< T, SIMD_WIDTH > div(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Divides the elements of two Vec's.
Definition base.H:439
static Vec< T, SIMD_WIDTH > absDiff(const Vec< T, SIMD_WIDTH > &v1, const Vec< T, SIMD_WIDTH > &v2)
Computes the absolute difference of the elements of two Vec's.
Definition ext.H:1860
static Vec< T, SIMD_WIDTH > avgru(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the average of the elements of two Vec's, rounded up.
Definition ext.H:1564
static Vec< T, SIMD_WIDTH > add(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Adds the elements of two Vec's.
Definition base.H:357
static Vec< T, SIMD_WIDTH > mul(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Multiplies the elements of two Vec's.
Definition base.H:421
static Vec< T, SIMD_WIDTH > cmplt(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for less-than ( < ).
Definition base.H:924
static Vec< T, SIMD_WIDTH > cmple(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for less-than-or-equal ( <= ).
Definition base.H:945
static Vec< T, SIMD_WIDTH > cmpneq(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for inequality ( != ).
Definition base.H:1029
static Vec< T, SIMD_WIDTH > cmpge(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for greater-than-or-equal ( >= ).
Definition base.H:987
static Vec< T, SIMD_WIDTH > cmpgt(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for greater-than ( > ).
Definition base.H:1008
static Vec< T, SIMD_WIDTH > cmpeq(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Compares corresponding elements of two Vec's for equality ( == ).
Definition base.H:966
float Float
Single-precision floating point number (32-bit)
Definition types.H:56
static Vec< T, SIMD_WIDTH > slle(const Vec< T, SIMD_WIDTH > &a)
Shifts a Vec left by a constant number of elements, shifting in zero elements.
Definition base.H:1353
static Vec< T, SIMD_WIDTH > srle(const Vec< T, SIMD_WIDTH > &a)
Shifts a Vec right by a constant number of elements, shifting in zero elements.
Definition base.H:1338
static void fdivmul(const Vec< Tin, SIMD_WIDTH > vecsNum[numInVecs< Tout, Tin >()], const Vec< Tin, SIMD_WIDTH > vecsDenom[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Divides Vec's element-wise, then multiplies with a constant factor in floating point arithmetic.
Definition ext.H:748
static void fwaddmul(const Vec< Tin, SIMD_WIDTH > vecsIn1[numInVecs< Tout, Tin >()], const Vec< Tin, SIMD_WIDTH > vecsIn2[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > w, dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Linearly interpolates Vec's element-wise with a constant weight and then scales by a constant factor ...
Definition ext.H:1050
static void fmul(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Multiplies Vec's element-wise with a floating point constant in floating point arithmetic.
Definition ext.H:931
static void fdivMsigmoidmul(const Vec< Tin, SIMD_WIDTH > vecsNum[DIM][NVEC], const Vec< Tin, SIMD_WIDTH > vecsDenom[DIM][NVEC], const double w[DIM], const double w0[DIM], double fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Special function used in MinWarping.
Definition ext.H:901
static void faddmul(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > off, dont_deduce< Tfloat > fac, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Adds a floating point constant to the elements of Vec's, then multiplies with a floating point consta...
Definition ext.H:965
typename std::conditional< internal::vec::max(sizeof(Tout), sizeof(Tin))<= sizeof(Float), Float, Double >::type BigEnoughFloat
Smallest floating point type that is at least as big as the input and output types.
Definition vec.H:266
static void fmuladd(const Vec< Tin, SIMD_WIDTH > vecsIn[numInVecs< Tout, Tin >()], dont_deduce< Tfloat > fac, dont_deduce< Tfloat > off, Vec< Tout, SIMD_WIDTH > vecsOut[numOutVecs< Tout, Tin >()])
Multiplies the elements of Vec's with a floating point constant, then adds a floating point constant ...
Definition ext.H:1004
static T hmax(const Vec< T, SIMD_WIDTH > &v)
Calculates the maximum of all elements of a Vec.
Definition ext.H:1281
static Vec< T, SIMD_WIDTH > hadds(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Horizontally adds adjacent elements of two Vec's with saturation.
Definition base.H:493
static T hmin(const Vec< T, SIMD_WIDTH > &v)
Calculates the minimum of all elements of a Vec.
Definition ext.H:1267
static simd::Vec< T, SIMD_WIDTH > integrate(const simd::Vec< T, SIMD_WIDTH > &v)
Integrates the values of a Vec.
Definition ext.H:3330
static Vec< T, SIMD_WIDTH > hadd(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Horizontally adds adjacent elements of two Vec's.
Definition base.H:477
static Vec< T, SIMD_WIDTH > setunity()
Sets all elements of a Vec to the value 1.
Definition ext.H:3393
static Vec< T, SIMD_WIDTH > setmax()
Sets all elements of a Vec to the maximum value of the element type.
Definition ext.H:3381
static Vec< T, SIMD_WIDTH > setmin()
Sets all elements of a Vec to the minimum value of the element type.
Definition ext.H:3368
static Vec< T, SIMD_WIDTH > setnegunity()
Sets all elements of a Vec to the value -1.
Definition ext.H:3407
static Vec< T, SIMD_WIDTH > setones()
Sets all bits of a Vec to 1.
Definition ext.H:3348
static Vec< T, SIMD_WIDTH > setzero()
Returns a Vec with all elements set to zero.
Definition base.H:70
static Vec< T, SIMD_WIDTH > set1(const dont_deduce< T > a)
Returns a Vec with all elements set to the same value.
Definition base.H:88
Horizontal addition class for iterative horizontal accumulation.
Definition ext.H:1452
Horizontal saturated addition class for iterative horizontal accumulation.
Definition ext.H:1476
Horizontal maximum class for iterative horizontal accumulation.
Definition ext.H:1522
Horizontal minimum class for iterative horizontal accumulation.
Definition ext.H:1499
static Vec< T, SIMD_WIDTH > bit_and(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise AND of two Vec's.
Definition base.H:732
static Vec< T, SIMD_WIDTH > bit_xor(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise XOR of two Vec's.
Definition base.H:776
static Vec< T, SIMD_WIDTH > bit_or(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the bitwise OR of two Vec's.
Definition base.H:746
static Vec< T, SIMD_WIDTH > bit_not(const Vec< T, SIMD_WIDTH > &a)
Computes the bitwise NOT of a Vec.
Definition base.H:789
static Vec< T, SIMD_WIDTH > sqrt(const Vec< T, SIMD_WIDTH > &a)
Computes the square root of the elements of a Vec.
Definition base.H:584
static Vec< T, SIMD_WIDTH > min(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the minimum of the elements of two Vec's.
Definition base.H:606
static Vec< T, SIMD_WIDTH > max(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Computes the maximum of the elements of two Vec's.
Definition base.H:620
static Vec< T, SIMD_WIDTH > neg(const Vec< T, SIMD_WIDTH > &a)
Negates the elements of a Vec.
Definition base.H:635
static Vec< T, SIMD_WIDTH > abs(const Vec< T, SIMD_WIDTH > &a)
Computes the absolute value of the elements of a Vec.
Definition base.H:654
static Vec< T, SIMD_WIDTH > load(const T *const p)
Loads a Vec from aligned memory.
Definition base.H:209
static Vec< T, SIMD_WIDTH > loadu(const T *const p)
Loads a Vec from unaligned memory.
Definition base.H:231
static void store(T *const p, const Vec< T, SIMD_WIDTH > &a)
Stores a Vec to aligned memory.
Definition base.H:246
static void storeu(T *const p, const Vec< T, SIMD_WIDTH > &a)
Stores a Vec to unaligned memory.
Definition base.H:265
static void load_storeu(const T *const src, T *const dst)
Copies a single Vec from one aligned memory location to another unaligned memory location.
Definition ext.H:491
static void loadu_storeu(const T *const src, T *const dst)
Copies a single Vec from one unaligned memory location to another unaligned memory location.
Definition ext.H:507
static void loadu_store(const T *const src, T *const dst)
Copies a single Vec from one unaligned memory location to another aligned memory location.
Definition ext.H:474
static void load_store(const T *const src, T *const dst)
Copies a single Vec from one aligned memory location to another aligned memory location.
Definition ext.H:457
static void print(const char *format, const Vec< T, SIMD_WIDTH > &vec)
Writes the formatted elements of a Vec to stdout.
Definition ext.H:157
static void fprint(FILE *f, const char *format, const Vec< T, SIMD_WIDTH > &vec)
Writes the formatted elements of a Vec to a file.
Definition ext.H:127
static void transpose(const Vec< T, SIMD_WIDTH > inRows[Vec< T, SIMD_WIDTH >::elems], Vec< T, SIMD_WIDTH > outRows[Vec< T, SIMD_WIDTH >::elems])
Transposes a matrix held in an array of Vec's.
Definition ext.H:3223
static Vec< T, SIMD_WIDTH > reverse(const Vec< T, SIMD_WIDTH > &a)
Reverses the order of the elements of a Vec.
Definition base.H:1101
static Vec< T, SIMD_WIDTH > srli(const Vec< T, SIMD_WIDTH > &a)
Shifts the elements of a Vec right by a constant number of bits while shifting in zeros.
Definition base.H:828
static Vec< T, SIMD_WIDTH > srai(const Vec< T, SIMD_WIDTH > &a)
Shifts the elements of a Vec right by a constant number of bits while shifting in the sign bit.
Definition base.H:812
static void bitonicSortSortedPairs(Vec< T, SIMD_WIDTH > vecs[Vec< T, SIMD_WIDTH >::elems])
Fuses consecutive pairs of sorted Vec's such that the pair is sorted over the two vectors.
Definition ext.H:3590
static void bitonicSort(Vec< T, SIMD_WIDTH > vecs[Vec< T, SIMD_WIDTH >::elems])
Sorts multiple Vec's independently using the bitonic sort algorithm.
Definition ext.H:3554
static void verticalBitonicSort(T data[LENGTH])
Sorts data vector using vertical version of bitonic sort. Assumes that data size is a power of 2 time...
Definition ext.H:3619
static void unswizzle(Vec< T, SIMD_WIDTH > v[2 *N])
Unswizzle/interleave/convert from SoA to AoS multiple Vec's in-place.
Definition ext.H:3203
static void swizzle2(Vec< T, SIMD_WIDTH > v[2 *N])
Swizzle/de-interleave/convert from AoS to SoA multiple Vec's in-place.
Definition ext.H:3160
static Vec< Tout, SIMD_WIDTH > cvts(const Vec< Tin, SIMD_WIDTH > &a)
Converts the elements of a Vec between integer and floating point types of the same size.
Definition base.H:1445
static Vec< Tout, SIMD_WIDTH > packs(const Vec< Tin, SIMD_WIDTH > &a, const Vec< Tin, SIMD_WIDTH > &b)
Packs two Vec's into one by converting the elements into the next smaller type with saturation.
Definition base.H:1397
static void extend(const Vec< Tin, SIMD_WIDTH > &vIn, Vec< Tout, SIMD_WIDTH > vOut[sizeof(Tout)/sizeof(Tin)])
Extends the elements of a Vec to a larger or equally sized type.
Definition base.H:1423
static constexpr size_t numInVecs()
Number of input vectors for functions that potentially change the size of the elements but not the nu...
Definition vec.H:201
static constexpr size_t numOutVecs()
Number of output vectors for functions that potentially change the size of the elements but not the n...
Definition vec.H:216
static void convert(const Vec< Tin, SIMD_WIDTH > inVecs[numInVecs< Tout, Tin >()], Vec< Tout, SIMD_WIDTH > outVecs[numOutVecs< Tout, Tin >()])
Converts (potentially multiple) Vec's between different types.
Definition ext.H:676
SortSlope
Used to indicate the direction of a sort function.
Definition types.H:115
static void zip16(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of each 16-byte lane of two Vec's.
Definition base.H:1286
static void zip(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of two Vec's.
Definition base.H:1247
static Vec< T, SIMD_WIDTH > unpack(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Interleaves blocks of elements from the high or low half of two Vec's.
Definition base.H:1169
static void unzip(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Deinterleaves blocks of elements of two Vec's.
Definition base.H:1316
static Vec< T, SIMD_WIDTH > unpack16(const Vec< T, SIMD_WIDTH > &a, const Vec< T, SIMD_WIDTH > &b)
Interleaves blocks of elements from the high or low half of each 16-byte lane of two Vec's.
Definition base.H:1209
Namespace for T-SIMD.
Definition time_measurement.H:161
typename internal::dont_deduce< T >::type dont_deduce
Helper type to prevent template argument deduction.
Definition types.H:416
Iterative horizontal accumulator. Calculates the horizontal accumulation of multiple (Vec<T,...
Definition ext.H:1311
bool isEmpty() const
Checks if the horizontal accumulation is empty, i.e. if no Vec has been pushed yet.
Definition ext.H:1324
void push(const Vec< T, SIMD_WIDTH > &v)
Pushes the next Vec to be horizontally accumulated. Does nothing if the horizontal accumulation is al...
Definition ext.H:1339
Vec< T, SIMD_WIDTH > get()
Gets the result of the horizontal accumulation. Finishes the horizontal accumulation if it is not don...
Definition ext.H:1369
void reset()
Resets the horizontal accumulation.
Definition ext.H:1378
void finish()
Finishes the horizontal accumulation by pushing neutral values until the horizontal accumulation is d...
Definition ext.H:1356
bool isDone() const
Checks if the horizontal accumulation is done.
Definition ext.H:1331