T-SIMD v31.1.0
A C++ template SIMD library
Loading...
Searching...
No Matches
ext_transpose.H
1// ===========================================================================
2//
3// auto-generated transpose functions with in-place processing
4// DO NOT EDIT!!!
5//
6// This source code file is part of the following software:
7//
8// - the low-level C++ template SIMD library
9// - the SIMD implementation of the MinWarping and the 2D-Warping methods
10// for local visual homing.
11//
12// The software is provided based on the accompanying license agreement in the
13// file LICENSE.md.
14// The software is provided "as is" without any warranty by the licensor and
15// without any liability of the licensor, and the software may not be
16// distributed by the licensee; see the license agreement for details.
17//
18// (C) Ralf Möller
19// Computer Engineering
20// Faculty of Technology
21// Bielefeld University
22// www.ti.uni-bielefeld.de
23//
24// ===========================================================================
25
26#pragma once
27#ifndef SIMD_VEC_EXT_TRANSPOSE_AUTOGEN_H_
28#define SIMD_VEC_EXT_TRANSPOSE_AUTOGEN_H_
29
30#include "../base.H"
31
32#include <utility>
33
34namespace simd {
35namespace internal {
36namespace ext {
37// ==========================================================
38// transpose1inplc
39// ==========================================================
40
41template <typename T, size_t SIMD_WIDTH>
42static SIMD_INLINE void transpose1inplc(
43 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
44 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<2>)
45{
46 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
47}
48
49template <typename T, size_t SIMD_WIDTH>
50static SIMD_INLINE void transpose1inplc(
51 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
52 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>)
53{
54 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
55 zip<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
56 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
57 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
58 std::swap(outRows[1], outRows[2]);
59}
60
61template <typename T, size_t SIMD_WIDTH>
62static SIMD_INLINE void transpose1inplc(
63 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
64 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>)
65{
66 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
67 zip<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
68 zip<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
69 zip<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
70 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
71 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
72 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
73 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
74 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
75 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
76 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
77 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
78 std::swap(outRows[1], outRows[4]);
79 std::swap(outRows[3], outRows[6]);
80}
81
82template <typename T, size_t SIMD_WIDTH>
83static SIMD_INLINE void transpose1inplc(
84 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
85 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>)
86{
87 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
88 zip<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
89 zip<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
90 zip<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
91 zip<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
92 zip<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
93 zip<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
94 zip<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
95 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
96 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
97 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
98 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
99 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
100 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
101 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
102 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
103 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
104 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
105 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
106 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
107 zip<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
108 zip<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
109 zip<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
110 zip<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
111 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
112 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
113 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
114 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
115 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
116 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
117 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
118 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
119 std::swap(outRows[1], outRows[8]);
120 std::swap(outRows[2], outRows[4]);
121 std::swap(outRows[3], outRows[12]);
122 std::swap(outRows[5], outRows[10]);
123 std::swap(outRows[7], outRows[14]);
124 std::swap(outRows[11], outRows[13]);
125}
126
127template <typename T, size_t SIMD_WIDTH>
128static SIMD_INLINE void transpose1inplc(
129 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
130 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>)
131{
132 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
133 zip<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
134 zip<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
135 zip<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
136 zip<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
137 zip<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
138 zip<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
139 zip<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
140 zip<1>(inRows[16], inRows[17], outRows[16], outRows[17]);
141 zip<1>(inRows[18], inRows[19], outRows[18], outRows[19]);
142 zip<1>(inRows[20], inRows[21], outRows[20], outRows[21]);
143 zip<1>(inRows[22], inRows[23], outRows[22], outRows[23]);
144 zip<1>(inRows[24], inRows[25], outRows[24], outRows[25]);
145 zip<1>(inRows[26], inRows[27], outRows[26], outRows[27]);
146 zip<1>(inRows[28], inRows[29], outRows[28], outRows[29]);
147 zip<1>(inRows[30], inRows[31], outRows[30], outRows[31]);
148 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
149 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
150 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
151 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
152 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
153 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
154 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
155 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
156 zip<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
157 zip<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
158 zip<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
159 zip<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
160 zip<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
161 zip<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
162 zip<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
163 zip<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
164 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
165 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
166 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
167 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
168 zip<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
169 zip<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
170 zip<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
171 zip<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
172 zip<4>(outRows[16], outRows[20], outRows[16], outRows[20]);
173 zip<4>(outRows[18], outRows[22], outRows[18], outRows[22]);
174 zip<4>(outRows[17], outRows[21], outRows[17], outRows[21]);
175 zip<4>(outRows[19], outRows[23], outRows[19], outRows[23]);
176 zip<4>(outRows[24], outRows[28], outRows[24], outRows[28]);
177 zip<4>(outRows[26], outRows[30], outRows[26], outRows[30]);
178 zip<4>(outRows[25], outRows[29], outRows[25], outRows[29]);
179 zip<4>(outRows[27], outRows[31], outRows[27], outRows[31]);
180 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
181 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
182 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
183 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
184 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
185 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
186 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
187 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
188 zip<8>(outRows[16], outRows[24], outRows[16], outRows[24]);
189 zip<8>(outRows[20], outRows[28], outRows[20], outRows[28]);
190 zip<8>(outRows[18], outRows[26], outRows[18], outRows[26]);
191 zip<8>(outRows[22], outRows[30], outRows[22], outRows[30]);
192 zip<8>(outRows[17], outRows[25], outRows[17], outRows[25]);
193 zip<8>(outRows[21], outRows[29], outRows[21], outRows[29]);
194 zip<8>(outRows[19], outRows[27], outRows[19], outRows[27]);
195 zip<8>(outRows[23], outRows[31], outRows[23], outRows[31]);
196 zip<16>(outRows[0], outRows[16], outRows[0], outRows[16]);
197 zip<16>(outRows[8], outRows[24], outRows[8], outRows[24]);
198 zip<16>(outRows[4], outRows[20], outRows[4], outRows[20]);
199 zip<16>(outRows[12], outRows[28], outRows[12], outRows[28]);
200 zip<16>(outRows[2], outRows[18], outRows[2], outRows[18]);
201 zip<16>(outRows[10], outRows[26], outRows[10], outRows[26]);
202 zip<16>(outRows[6], outRows[22], outRows[6], outRows[22]);
203 zip<16>(outRows[14], outRows[30], outRows[14], outRows[30]);
204 zip<16>(outRows[1], outRows[17], outRows[1], outRows[17]);
205 zip<16>(outRows[9], outRows[25], outRows[9], outRows[25]);
206 zip<16>(outRows[5], outRows[21], outRows[5], outRows[21]);
207 zip<16>(outRows[13], outRows[29], outRows[13], outRows[29]);
208 zip<16>(outRows[3], outRows[19], outRows[3], outRows[19]);
209 zip<16>(outRows[11], outRows[27], outRows[11], outRows[27]);
210 zip<16>(outRows[7], outRows[23], outRows[7], outRows[23]);
211 zip<16>(outRows[15], outRows[31], outRows[15], outRows[31]);
212 std::swap(outRows[1], outRows[16]);
213 std::swap(outRows[2], outRows[8]);
214 std::swap(outRows[3], outRows[24]);
215 std::swap(outRows[5], outRows[20]);
216 std::swap(outRows[6], outRows[12]);
217 std::swap(outRows[7], outRows[28]);
218 std::swap(outRows[9], outRows[18]);
219 std::swap(outRows[11], outRows[26]);
220 std::swap(outRows[13], outRows[22]);
221 std::swap(outRows[15], outRows[30]);
222 std::swap(outRows[19], outRows[25]);
223 std::swap(outRows[23], outRows[29]);
224}
225
226template <typename T, size_t SIMD_WIDTH>
227static SIMD_INLINE void transpose1inplc(
228 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
229 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<64>)
230{
231 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
232 zip<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
233 zip<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
234 zip<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
235 zip<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
236 zip<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
237 zip<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
238 zip<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
239 zip<1>(inRows[16], inRows[17], outRows[16], outRows[17]);
240 zip<1>(inRows[18], inRows[19], outRows[18], outRows[19]);
241 zip<1>(inRows[20], inRows[21], outRows[20], outRows[21]);
242 zip<1>(inRows[22], inRows[23], outRows[22], outRows[23]);
243 zip<1>(inRows[24], inRows[25], outRows[24], outRows[25]);
244 zip<1>(inRows[26], inRows[27], outRows[26], outRows[27]);
245 zip<1>(inRows[28], inRows[29], outRows[28], outRows[29]);
246 zip<1>(inRows[30], inRows[31], outRows[30], outRows[31]);
247 zip<1>(inRows[32], inRows[33], outRows[32], outRows[33]);
248 zip<1>(inRows[34], inRows[35], outRows[34], outRows[35]);
249 zip<1>(inRows[36], inRows[37], outRows[36], outRows[37]);
250 zip<1>(inRows[38], inRows[39], outRows[38], outRows[39]);
251 zip<1>(inRows[40], inRows[41], outRows[40], outRows[41]);
252 zip<1>(inRows[42], inRows[43], outRows[42], outRows[43]);
253 zip<1>(inRows[44], inRows[45], outRows[44], outRows[45]);
254 zip<1>(inRows[46], inRows[47], outRows[46], outRows[47]);
255 zip<1>(inRows[48], inRows[49], outRows[48], outRows[49]);
256 zip<1>(inRows[50], inRows[51], outRows[50], outRows[51]);
257 zip<1>(inRows[52], inRows[53], outRows[52], outRows[53]);
258 zip<1>(inRows[54], inRows[55], outRows[54], outRows[55]);
259 zip<1>(inRows[56], inRows[57], outRows[56], outRows[57]);
260 zip<1>(inRows[58], inRows[59], outRows[58], outRows[59]);
261 zip<1>(inRows[60], inRows[61], outRows[60], outRows[61]);
262 zip<1>(inRows[62], inRows[63], outRows[62], outRows[63]);
263 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
264 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
265 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
266 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
267 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
268 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
269 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
270 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
271 zip<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
272 zip<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
273 zip<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
274 zip<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
275 zip<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
276 zip<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
277 zip<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
278 zip<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
279 zip<2>(outRows[32], outRows[34], outRows[32], outRows[34]);
280 zip<2>(outRows[33], outRows[35], outRows[33], outRows[35]);
281 zip<2>(outRows[36], outRows[38], outRows[36], outRows[38]);
282 zip<2>(outRows[37], outRows[39], outRows[37], outRows[39]);
283 zip<2>(outRows[40], outRows[42], outRows[40], outRows[42]);
284 zip<2>(outRows[41], outRows[43], outRows[41], outRows[43]);
285 zip<2>(outRows[44], outRows[46], outRows[44], outRows[46]);
286 zip<2>(outRows[45], outRows[47], outRows[45], outRows[47]);
287 zip<2>(outRows[48], outRows[50], outRows[48], outRows[50]);
288 zip<2>(outRows[49], outRows[51], outRows[49], outRows[51]);
289 zip<2>(outRows[52], outRows[54], outRows[52], outRows[54]);
290 zip<2>(outRows[53], outRows[55], outRows[53], outRows[55]);
291 zip<2>(outRows[56], outRows[58], outRows[56], outRows[58]);
292 zip<2>(outRows[57], outRows[59], outRows[57], outRows[59]);
293 zip<2>(outRows[60], outRows[62], outRows[60], outRows[62]);
294 zip<2>(outRows[61], outRows[63], outRows[61], outRows[63]);
295 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
296 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
297 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
298 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
299 zip<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
300 zip<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
301 zip<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
302 zip<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
303 zip<4>(outRows[16], outRows[20], outRows[16], outRows[20]);
304 zip<4>(outRows[18], outRows[22], outRows[18], outRows[22]);
305 zip<4>(outRows[17], outRows[21], outRows[17], outRows[21]);
306 zip<4>(outRows[19], outRows[23], outRows[19], outRows[23]);
307 zip<4>(outRows[24], outRows[28], outRows[24], outRows[28]);
308 zip<4>(outRows[26], outRows[30], outRows[26], outRows[30]);
309 zip<4>(outRows[25], outRows[29], outRows[25], outRows[29]);
310 zip<4>(outRows[27], outRows[31], outRows[27], outRows[31]);
311 zip<4>(outRows[32], outRows[36], outRows[32], outRows[36]);
312 zip<4>(outRows[34], outRows[38], outRows[34], outRows[38]);
313 zip<4>(outRows[33], outRows[37], outRows[33], outRows[37]);
314 zip<4>(outRows[35], outRows[39], outRows[35], outRows[39]);
315 zip<4>(outRows[40], outRows[44], outRows[40], outRows[44]);
316 zip<4>(outRows[42], outRows[46], outRows[42], outRows[46]);
317 zip<4>(outRows[41], outRows[45], outRows[41], outRows[45]);
318 zip<4>(outRows[43], outRows[47], outRows[43], outRows[47]);
319 zip<4>(outRows[48], outRows[52], outRows[48], outRows[52]);
320 zip<4>(outRows[50], outRows[54], outRows[50], outRows[54]);
321 zip<4>(outRows[49], outRows[53], outRows[49], outRows[53]);
322 zip<4>(outRows[51], outRows[55], outRows[51], outRows[55]);
323 zip<4>(outRows[56], outRows[60], outRows[56], outRows[60]);
324 zip<4>(outRows[58], outRows[62], outRows[58], outRows[62]);
325 zip<4>(outRows[57], outRows[61], outRows[57], outRows[61]);
326 zip<4>(outRows[59], outRows[63], outRows[59], outRows[63]);
327 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
328 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
329 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
330 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
331 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
332 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
333 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
334 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
335 zip<8>(outRows[16], outRows[24], outRows[16], outRows[24]);
336 zip<8>(outRows[20], outRows[28], outRows[20], outRows[28]);
337 zip<8>(outRows[18], outRows[26], outRows[18], outRows[26]);
338 zip<8>(outRows[22], outRows[30], outRows[22], outRows[30]);
339 zip<8>(outRows[17], outRows[25], outRows[17], outRows[25]);
340 zip<8>(outRows[21], outRows[29], outRows[21], outRows[29]);
341 zip<8>(outRows[19], outRows[27], outRows[19], outRows[27]);
342 zip<8>(outRows[23], outRows[31], outRows[23], outRows[31]);
343 zip<8>(outRows[32], outRows[40], outRows[32], outRows[40]);
344 zip<8>(outRows[36], outRows[44], outRows[36], outRows[44]);
345 zip<8>(outRows[34], outRows[42], outRows[34], outRows[42]);
346 zip<8>(outRows[38], outRows[46], outRows[38], outRows[46]);
347 zip<8>(outRows[33], outRows[41], outRows[33], outRows[41]);
348 zip<8>(outRows[37], outRows[45], outRows[37], outRows[45]);
349 zip<8>(outRows[35], outRows[43], outRows[35], outRows[43]);
350 zip<8>(outRows[39], outRows[47], outRows[39], outRows[47]);
351 zip<8>(outRows[48], outRows[56], outRows[48], outRows[56]);
352 zip<8>(outRows[52], outRows[60], outRows[52], outRows[60]);
353 zip<8>(outRows[50], outRows[58], outRows[50], outRows[58]);
354 zip<8>(outRows[54], outRows[62], outRows[54], outRows[62]);
355 zip<8>(outRows[49], outRows[57], outRows[49], outRows[57]);
356 zip<8>(outRows[53], outRows[61], outRows[53], outRows[61]);
357 zip<8>(outRows[51], outRows[59], outRows[51], outRows[59]);
358 zip<8>(outRows[55], outRows[63], outRows[55], outRows[63]);
359 zip<16>(outRows[0], outRows[16], outRows[0], outRows[16]);
360 zip<16>(outRows[8], outRows[24], outRows[8], outRows[24]);
361 zip<16>(outRows[4], outRows[20], outRows[4], outRows[20]);
362 zip<16>(outRows[12], outRows[28], outRows[12], outRows[28]);
363 zip<16>(outRows[2], outRows[18], outRows[2], outRows[18]);
364 zip<16>(outRows[10], outRows[26], outRows[10], outRows[26]);
365 zip<16>(outRows[6], outRows[22], outRows[6], outRows[22]);
366 zip<16>(outRows[14], outRows[30], outRows[14], outRows[30]);
367 zip<16>(outRows[1], outRows[17], outRows[1], outRows[17]);
368 zip<16>(outRows[9], outRows[25], outRows[9], outRows[25]);
369 zip<16>(outRows[5], outRows[21], outRows[5], outRows[21]);
370 zip<16>(outRows[13], outRows[29], outRows[13], outRows[29]);
371 zip<16>(outRows[3], outRows[19], outRows[3], outRows[19]);
372 zip<16>(outRows[11], outRows[27], outRows[11], outRows[27]);
373 zip<16>(outRows[7], outRows[23], outRows[7], outRows[23]);
374 zip<16>(outRows[15], outRows[31], outRows[15], outRows[31]);
375 zip<16>(outRows[32], outRows[48], outRows[32], outRows[48]);
376 zip<16>(outRows[40], outRows[56], outRows[40], outRows[56]);
377 zip<16>(outRows[36], outRows[52], outRows[36], outRows[52]);
378 zip<16>(outRows[44], outRows[60], outRows[44], outRows[60]);
379 zip<16>(outRows[34], outRows[50], outRows[34], outRows[50]);
380 zip<16>(outRows[42], outRows[58], outRows[42], outRows[58]);
381 zip<16>(outRows[38], outRows[54], outRows[38], outRows[54]);
382 zip<16>(outRows[46], outRows[62], outRows[46], outRows[62]);
383 zip<16>(outRows[33], outRows[49], outRows[33], outRows[49]);
384 zip<16>(outRows[41], outRows[57], outRows[41], outRows[57]);
385 zip<16>(outRows[37], outRows[53], outRows[37], outRows[53]);
386 zip<16>(outRows[45], outRows[61], outRows[45], outRows[61]);
387 zip<16>(outRows[35], outRows[51], outRows[35], outRows[51]);
388 zip<16>(outRows[43], outRows[59], outRows[43], outRows[59]);
389 zip<16>(outRows[39], outRows[55], outRows[39], outRows[55]);
390 zip<16>(outRows[47], outRows[63], outRows[47], outRows[63]);
391 zip<32>(outRows[0], outRows[32], outRows[0], outRows[32]);
392 zip<32>(outRows[16], outRows[48], outRows[16], outRows[48]);
393 zip<32>(outRows[8], outRows[40], outRows[8], outRows[40]);
394 zip<32>(outRows[24], outRows[56], outRows[24], outRows[56]);
395 zip<32>(outRows[4], outRows[36], outRows[4], outRows[36]);
396 zip<32>(outRows[20], outRows[52], outRows[20], outRows[52]);
397 zip<32>(outRows[12], outRows[44], outRows[12], outRows[44]);
398 zip<32>(outRows[28], outRows[60], outRows[28], outRows[60]);
399 zip<32>(outRows[2], outRows[34], outRows[2], outRows[34]);
400 zip<32>(outRows[18], outRows[50], outRows[18], outRows[50]);
401 zip<32>(outRows[10], outRows[42], outRows[10], outRows[42]);
402 zip<32>(outRows[26], outRows[58], outRows[26], outRows[58]);
403 zip<32>(outRows[6], outRows[38], outRows[6], outRows[38]);
404 zip<32>(outRows[22], outRows[54], outRows[22], outRows[54]);
405 zip<32>(outRows[14], outRows[46], outRows[14], outRows[46]);
406 zip<32>(outRows[30], outRows[62], outRows[30], outRows[62]);
407 zip<32>(outRows[1], outRows[33], outRows[1], outRows[33]);
408 zip<32>(outRows[17], outRows[49], outRows[17], outRows[49]);
409 zip<32>(outRows[9], outRows[41], outRows[9], outRows[41]);
410 zip<32>(outRows[25], outRows[57], outRows[25], outRows[57]);
411 zip<32>(outRows[5], outRows[37], outRows[5], outRows[37]);
412 zip<32>(outRows[21], outRows[53], outRows[21], outRows[53]);
413 zip<32>(outRows[13], outRows[45], outRows[13], outRows[45]);
414 zip<32>(outRows[29], outRows[61], outRows[29], outRows[61]);
415 zip<32>(outRows[3], outRows[35], outRows[3], outRows[35]);
416 zip<32>(outRows[19], outRows[51], outRows[19], outRows[51]);
417 zip<32>(outRows[11], outRows[43], outRows[11], outRows[43]);
418 zip<32>(outRows[27], outRows[59], outRows[27], outRows[59]);
419 zip<32>(outRows[7], outRows[39], outRows[7], outRows[39]);
420 zip<32>(outRows[23], outRows[55], outRows[23], outRows[55]);
421 zip<32>(outRows[15], outRows[47], outRows[15], outRows[47]);
422 zip<32>(outRows[31], outRows[63], outRows[31], outRows[63]);
423 std::swap(outRows[1], outRows[32]);
424 std::swap(outRows[2], outRows[16]);
425 std::swap(outRows[3], outRows[48]);
426 std::swap(outRows[4], outRows[8]);
427 std::swap(outRows[5], outRows[40]);
428 std::swap(outRows[6], outRows[24]);
429 std::swap(outRows[7], outRows[56]);
430 std::swap(outRows[9], outRows[36]);
431 std::swap(outRows[10], outRows[20]);
432 std::swap(outRows[11], outRows[52]);
433 std::swap(outRows[13], outRows[44]);
434 std::swap(outRows[14], outRows[28]);
435 std::swap(outRows[15], outRows[60]);
436 std::swap(outRows[17], outRows[34]);
437 std::swap(outRows[19], outRows[50]);
438 std::swap(outRows[21], outRows[42]);
439 std::swap(outRows[22], outRows[26]);
440 std::swap(outRows[23], outRows[58]);
441 std::swap(outRows[25], outRows[38]);
442 std::swap(outRows[27], outRows[54]);
443 std::swap(outRows[29], outRows[46]);
444 std::swap(outRows[31], outRows[62]);
445 std::swap(outRows[35], outRows[49]);
446 std::swap(outRows[37], outRows[41]);
447 std::swap(outRows[39], outRows[57]);
448 std::swap(outRows[43], outRows[53]);
449 std::swap(outRows[47], outRows[61]);
450 std::swap(outRows[55], outRows[59]);
451}
452
453template <typename T, size_t SIMD_WIDTH>
454static SIMD_INLINE void transpose1inplc(
455 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
456 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
457{
458 transpose1inplc(inRows, outRows, Elements<Vec<T, SIMD_WIDTH>::elements>());
459}
460
461// ==========================================================
462// transpose1inplcLane
463// ==========================================================
464
465template <typename T, size_t SIMD_WIDTH>
466static SIMD_INLINE void transpose1inplcLane(
467 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
468 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
469 Bytes<16>)
470{
471 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
472 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
473 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
474 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
475 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
476 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
477 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
478 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
479 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
480 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
481 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
482 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
483 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
484 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
485 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
486 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
487 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
488 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
489 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
490 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
491 zip16<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
492 zip16<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
493 zip16<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
494 zip16<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
495 zip16<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
496 zip16<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
497 zip16<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
498 zip16<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
499 zip16<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
500 zip16<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
501 zip16<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
502 zip16<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
503 std::swap(outRows[1], outRows[8]);
504 std::swap(outRows[2], outRows[4]);
505 std::swap(outRows[3], outRows[12]);
506 std::swap(outRows[5], outRows[10]);
507 std::swap(outRows[7], outRows[14]);
508 std::swap(outRows[11], outRows[13]);
509 // correction steps follow below (if required)
510}
511
512template <typename T, size_t SIMD_WIDTH>
513static SIMD_INLINE void transpose1inplcLane(
514 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
515 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<16>)
516{
517 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
518 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
519 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
520 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
521 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
522 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
523 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
524 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
525 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
526 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
527 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
528 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
529 std::swap(outRows[1], outRows[4]);
530 std::swap(outRows[3], outRows[6]);
531 // correction steps follow below (if required)
532}
533
534template <typename T, size_t SIMD_WIDTH>
535static SIMD_INLINE void transpose1inplcLane(
536 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
537 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<16>)
538{
539 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
540 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
541 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
542 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
543 std::swap(outRows[1], outRows[2]);
544 // correction steps follow below (if required)
545}
546
547template <typename T, size_t SIMD_WIDTH>
548static SIMD_INLINE void transpose1inplcLane(
549 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
550 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<2>, Bytes<16>)
551{
552 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
553 // correction steps follow below (if required)
554}
555
// transpose1inplcLane overload selected by the tags Elements<32>, Bytes<32>.
// inRows is only read in the first stage; every later step works in place on
// outRows. Call schedule, exactly as written below:
//   1. zip16<1> on adjacent row pairs (2i, 2i+1), inRows -> outRows
//   2. zip16<2>, zip16<4>, zip16<8> in place, pairing rows at distance 2, 4
//      and 8 respectively
//   3. std::swap of rows inside each group of 16 rows
//   4. zip<16>, pairing row i with row i + 16
// zip16 and zip are declared outside this chunk (presumably via ../base.H);
// judging by the names, zip16 presumably interleaves within 16-byte lanes and
// zip across the whole vector - TODO confirm against their definitions.
// NOTE(review): from stage 2 on the same rows are passed as inputs and as
// outputs, so this relies on zip16/zip tolerating aliased arguments - confirm.
556template <typename T, size_t SIMD_WIDTH>
557static SIMD_INLINE void transpose1inplcLane(
558 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
559 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>,
560 Bytes<32>)
561{
    // stage 1: zip16<1> on adjacent input rows, results go to outRows
562 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
563 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
564 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
565 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
566 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
567 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
568 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
569 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
570 zip16<1>(inRows[16], inRows[17], outRows[16], outRows[17]);
571 zip16<1>(inRows[18], inRows[19], outRows[18], outRows[19]);
572 zip16<1>(inRows[20], inRows[21], outRows[20], outRows[21]);
573 zip16<1>(inRows[22], inRows[23], outRows[22], outRows[23]);
574 zip16<1>(inRows[24], inRows[25], outRows[24], outRows[25]);
575 zip16<1>(inRows[26], inRows[27], outRows[26], outRows[27]);
576 zip16<1>(inRows[28], inRows[29], outRows[28], outRows[29]);
577 zip16<1>(inRows[30], inRows[31], outRows[30], outRows[31]);
    // stage 2: zip16<2> in place, rows at distance 2 within each group of 4
578 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
579 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
580 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
581 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
582 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
583 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
584 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
585 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
586 zip16<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
587 zip16<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
588 zip16<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
589 zip16<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
590 zip16<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
591 zip16<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
592 zip16<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
593 zip16<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
    // stage 3: zip16<4> in place, rows at distance 4 within each group of 8
594 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
595 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
596 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
597 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
598 zip16<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
599 zip16<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
600 zip16<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
601 zip16<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
602 zip16<4>(outRows[16], outRows[20], outRows[16], outRows[20]);
603 zip16<4>(outRows[18], outRows[22], outRows[18], outRows[22]);
604 zip16<4>(outRows[17], outRows[21], outRows[17], outRows[21]);
605 zip16<4>(outRows[19], outRows[23], outRows[19], outRows[23]);
606 zip16<4>(outRows[24], outRows[28], outRows[24], outRows[28]);
607 zip16<4>(outRows[26], outRows[30], outRows[26], outRows[30]);
608 zip16<4>(outRows[25], outRows[29], outRows[25], outRows[29]);
609 zip16<4>(outRows[27], outRows[31], outRows[27], outRows[31]);
    // stage 4: zip16<8> in place, rows at distance 8 within each group of 16
610 zip16<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
611 zip16<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
612 zip16<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
613 zip16<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
614 zip16<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
615 zip16<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
616 zip16<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
617 zip16<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
618 zip16<8>(outRows[16], outRows[24], outRows[16], outRows[24]);
619 zip16<8>(outRows[20], outRows[28], outRows[20], outRows[28]);
620 zip16<8>(outRows[18], outRows[26], outRows[18], outRows[26]);
621 zip16<8>(outRows[22], outRows[30], outRows[22], outRows[30]);
622 zip16<8>(outRows[17], outRows[25], outRows[17], outRows[25]);
623 zip16<8>(outRows[21], outRows[29], outRows[21], outRows[29]);
624 zip16<8>(outRows[19], outRows[27], outRows[19], outRows[27]);
625 zip16<8>(outRows[23], outRows[31], outRows[23], outRows[31]);
    // row reordering: within each group of 16 rows the swapped index pairs
    // are each other's 4-bit bit reversal (1<->8, 2<->4, 3<->12, ...)
626 std::swap(outRows[1], outRows[8]);
627 std::swap(outRows[2], outRows[4]);
628 std::swap(outRows[3], outRows[12]);
629 std::swap(outRows[5], outRows[10]);
630 std::swap(outRows[7], outRows[14]);
631 std::swap(outRows[11], outRows[13]);
632 std::swap(outRows[17], outRows[24]);
633 std::swap(outRows[18], outRows[20]);
634 std::swap(outRows[19], outRows[28]);
635 std::swap(outRows[21], outRows[26]);
636 std::swap(outRows[23], outRows[30]);
637 std::swap(outRows[27], outRows[29]);
638 // correction steps follow below (if required)
    // zip<16> (not zip16) in place: row i is paired with row i + 16
639 zip<16>(outRows[0], outRows[16], outRows[0], outRows[16]);
640 zip<16>(outRows[1], outRows[17], outRows[1], outRows[17]);
641 zip<16>(outRows[2], outRows[18], outRows[2], outRows[18]);
642 zip<16>(outRows[3], outRows[19], outRows[3], outRows[19]);
643 zip<16>(outRows[4], outRows[20], outRows[4], outRows[20]);
644 zip<16>(outRows[5], outRows[21], outRows[5], outRows[21]);
645 zip<16>(outRows[6], outRows[22], outRows[6], outRows[22]);
646 zip<16>(outRows[7], outRows[23], outRows[7], outRows[23]);
647 zip<16>(outRows[8], outRows[24], outRows[8], outRows[24]);
648 zip<16>(outRows[9], outRows[25], outRows[9], outRows[25]);
649 zip<16>(outRows[10], outRows[26], outRows[10], outRows[26]);
650 zip<16>(outRows[11], outRows[27], outRows[11], outRows[27]);
651 zip<16>(outRows[12], outRows[28], outRows[12], outRows[28]);
652 zip<16>(outRows[13], outRows[29], outRows[13], outRows[29]);
653 zip<16>(outRows[14], outRows[30], outRows[14], outRows[30]);
654 zip<16>(outRows[15], outRows[31], outRows[15], outRows[31]);
655}
656
// transpose1inplcLane overload selected by the tags Elements<16>, Bytes<32>.
// inRows is only read in the first stage; every later step works in place on
// outRows. Call schedule, exactly as written below:
//   1. zip16<1> on adjacent row pairs (2i, 2i+1), inRows -> outRows
//   2. zip16<2> and zip16<4> in place, pairing rows at distance 2 and 4
//   3. std::swap of rows inside each group of 8 rows
//   4. zip<8>, pairing row i with row i + 8
// zip16 and zip are declared outside this chunk (presumably via ../base.H);
// judging by the names, zip16 presumably interleaves within 16-byte lanes and
// zip across the whole vector - TODO confirm against their definitions.
// NOTE(review): from stage 2 on the same rows are passed as inputs and as
// outputs, so this relies on zip16/zip tolerating aliased arguments - confirm.
657template <typename T, size_t SIMD_WIDTH>
658static SIMD_INLINE void transpose1inplcLane(
659 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
660 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
661 Bytes<32>)
662{
    // stage 1: zip16<1> on adjacent input rows, results go to outRows
663 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
664 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
665 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
666 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
667 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
668 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
669 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
670 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
    // stage 2: zip16<2> in place, rows at distance 2 within each group of 4
671 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
672 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
673 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
674 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
675 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
676 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
677 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
678 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
    // stage 3: zip16<4> in place, rows at distance 4 within each group of 8
679 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
680 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
681 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
682 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
683 zip16<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
684 zip16<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
685 zip16<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
686 zip16<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
    // row reordering: within each group of 8 rows the swapped index pairs
    // are each other's 3-bit bit reversal (1<->4, 3<->6)
687 std::swap(outRows[1], outRows[4]);
688 std::swap(outRows[3], outRows[6]);
689 std::swap(outRows[9], outRows[12]);
690 std::swap(outRows[11], outRows[14]);
691 // correction steps follow below (if required)
    // zip<8> (not zip16) in place: row i is paired with row i + 8
692 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
693 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
694 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
695 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
696 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
697 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
698 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
699 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
700}
701
702template <typename T, size_t SIMD_WIDTH>
703static SIMD_INLINE void transpose1inplcLane(
704 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
705 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<32>)
706{
707 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
708 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
709 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
710 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
711 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
712 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
713 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
714 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
715 std::swap(outRows[1], outRows[2]);
716 std::swap(outRows[5], outRows[6]);
717 // correction steps follow below (if required)
718 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
719 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
720 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
721 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
722}
723
724template <typename T, size_t SIMD_WIDTH>
725static SIMD_INLINE void transpose1inplcLane(
726 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
727 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<32>)
728{
729 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
730 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
731 // correction steps follow below (if required)
732 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
733 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
734}
735
// transpose1inplcLane overload selected by the tags Elements<64>, Bytes<64>.
// inRows is only read in the first stage; every later step works in place on
// outRows. Call schedule, exactly as written below:
//   1. zip16<1> on adjacent row pairs (2i, 2i+1), inRows -> outRows
//   2. zip16<2>, zip16<4>, zip16<8> in place, pairing rows at distance 2, 4
//      and 8 respectively
//   3. std::swap of rows inside each group of 16 rows
//   4. zip<16> (row i with row i + 16 inside each group of 32 rows), then
//      zip<32> (row i with row i + 32)
//   5. std::swap exchanging rows 16..31 with rows 32..47
// zip16 and zip are declared outside this chunk (presumably via ../base.H);
// judging by the names, zip16 presumably interleaves within 16-byte lanes and
// zip across the whole vector - TODO confirm against their definitions.
// NOTE(review): from stage 2 on the same rows are passed as inputs and as
// outputs, so this relies on zip16/zip tolerating aliased arguments - confirm.
736template <typename T, size_t SIMD_WIDTH>
737static SIMD_INLINE void transpose1inplcLane(
738 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
739 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<64>,
740 Bytes<64>)
741{
    // stage 1: zip16<1> on adjacent input rows, results go to outRows
742 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
743 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
744 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
745 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
746 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
747 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
748 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
749 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
750 zip16<1>(inRows[16], inRows[17], outRows[16], outRows[17]);
751 zip16<1>(inRows[18], inRows[19], outRows[18], outRows[19]);
752 zip16<1>(inRows[20], inRows[21], outRows[20], outRows[21]);
753 zip16<1>(inRows[22], inRows[23], outRows[22], outRows[23]);
754 zip16<1>(inRows[24], inRows[25], outRows[24], outRows[25]);
755 zip16<1>(inRows[26], inRows[27], outRows[26], outRows[27]);
756 zip16<1>(inRows[28], inRows[29], outRows[28], outRows[29]);
757 zip16<1>(inRows[30], inRows[31], outRows[30], outRows[31]);
758 zip16<1>(inRows[32], inRows[33], outRows[32], outRows[33]);
759 zip16<1>(inRows[34], inRows[35], outRows[34], outRows[35]);
760 zip16<1>(inRows[36], inRows[37], outRows[36], outRows[37]);
761 zip16<1>(inRows[38], inRows[39], outRows[38], outRows[39]);
762 zip16<1>(inRows[40], inRows[41], outRows[40], outRows[41]);
763 zip16<1>(inRows[42], inRows[43], outRows[42], outRows[43]);
764 zip16<1>(inRows[44], inRows[45], outRows[44], outRows[45]);
765 zip16<1>(inRows[46], inRows[47], outRows[46], outRows[47]);
766 zip16<1>(inRows[48], inRows[49], outRows[48], outRows[49]);
767 zip16<1>(inRows[50], inRows[51], outRows[50], outRows[51]);
768 zip16<1>(inRows[52], inRows[53], outRows[52], outRows[53]);
769 zip16<1>(inRows[54], inRows[55], outRows[54], outRows[55]);
770 zip16<1>(inRows[56], inRows[57], outRows[56], outRows[57]);
771 zip16<1>(inRows[58], inRows[59], outRows[58], outRows[59]);
772 zip16<1>(inRows[60], inRows[61], outRows[60], outRows[61]);
773 zip16<1>(inRows[62], inRows[63], outRows[62], outRows[63]);
    // stage 2: zip16<2> in place, rows at distance 2 within each group of 4
774 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
775 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
776 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
777 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
778 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
779 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
780 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
781 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
782 zip16<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
783 zip16<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
784 zip16<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
785 zip16<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
786 zip16<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
787 zip16<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
788 zip16<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
789 zip16<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
790 zip16<2>(outRows[32], outRows[34], outRows[32], outRows[34]);
791 zip16<2>(outRows[33], outRows[35], outRows[33], outRows[35]);
792 zip16<2>(outRows[36], outRows[38], outRows[36], outRows[38]);
793 zip16<2>(outRows[37], outRows[39], outRows[37], outRows[39]);
794 zip16<2>(outRows[40], outRows[42], outRows[40], outRows[42]);
795 zip16<2>(outRows[41], outRows[43], outRows[41], outRows[43]);
796 zip16<2>(outRows[44], outRows[46], outRows[44], outRows[46]);
797 zip16<2>(outRows[45], outRows[47], outRows[45], outRows[47]);
798 zip16<2>(outRows[48], outRows[50], outRows[48], outRows[50]);
799 zip16<2>(outRows[49], outRows[51], outRows[49], outRows[51]);
800 zip16<2>(outRows[52], outRows[54], outRows[52], outRows[54]);
801 zip16<2>(outRows[53], outRows[55], outRows[53], outRows[55]);
802 zip16<2>(outRows[56], outRows[58], outRows[56], outRows[58]);
803 zip16<2>(outRows[57], outRows[59], outRows[57], outRows[59]);
804 zip16<2>(outRows[60], outRows[62], outRows[60], outRows[62]);
805 zip16<2>(outRows[61], outRows[63], outRows[61], outRows[63]);
    // stage 3: zip16<4> in place, rows at distance 4 within each group of 8
806 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
807 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
808 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
809 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
810 zip16<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
811 zip16<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
812 zip16<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
813 zip16<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
814 zip16<4>(outRows[16], outRows[20], outRows[16], outRows[20]);
815 zip16<4>(outRows[18], outRows[22], outRows[18], outRows[22]);
816 zip16<4>(outRows[17], outRows[21], outRows[17], outRows[21]);
817 zip16<4>(outRows[19], outRows[23], outRows[19], outRows[23]);
818 zip16<4>(outRows[24], outRows[28], outRows[24], outRows[28]);
819 zip16<4>(outRows[26], outRows[30], outRows[26], outRows[30]);
820 zip16<4>(outRows[25], outRows[29], outRows[25], outRows[29]);
821 zip16<4>(outRows[27], outRows[31], outRows[27], outRows[31]);
822 zip16<4>(outRows[32], outRows[36], outRows[32], outRows[36]);
823 zip16<4>(outRows[34], outRows[38], outRows[34], outRows[38]);
824 zip16<4>(outRows[33], outRows[37], outRows[33], outRows[37]);
825 zip16<4>(outRows[35], outRows[39], outRows[35], outRows[39]);
826 zip16<4>(outRows[40], outRows[44], outRows[40], outRows[44]);
827 zip16<4>(outRows[42], outRows[46], outRows[42], outRows[46]);
828 zip16<4>(outRows[41], outRows[45], outRows[41], outRows[45]);
829 zip16<4>(outRows[43], outRows[47], outRows[43], outRows[47]);
830 zip16<4>(outRows[48], outRows[52], outRows[48], outRows[52]);
831 zip16<4>(outRows[50], outRows[54], outRows[50], outRows[54]);
832 zip16<4>(outRows[49], outRows[53], outRows[49], outRows[53]);
833 zip16<4>(outRows[51], outRows[55], outRows[51], outRows[55]);
834 zip16<4>(outRows[56], outRows[60], outRows[56], outRows[60]);
835 zip16<4>(outRows[58], outRows[62], outRows[58], outRows[62]);
836 zip16<4>(outRows[57], outRows[61], outRows[57], outRows[61]);
837 zip16<4>(outRows[59], outRows[63], outRows[59], outRows[63]);
    // stage 4: zip16<8> in place, rows at distance 8 within each group of 16
838 zip16<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
839 zip16<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
840 zip16<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
841 zip16<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
842 zip16<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
843 zip16<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
844 zip16<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
845 zip16<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
846 zip16<8>(outRows[16], outRows[24], outRows[16], outRows[24]);
847 zip16<8>(outRows[20], outRows[28], outRows[20], outRows[28]);
848 zip16<8>(outRows[18], outRows[26], outRows[18], outRows[26]);
849 zip16<8>(outRows[22], outRows[30], outRows[22], outRows[30]);
850 zip16<8>(outRows[17], outRows[25], outRows[17], outRows[25]);
851 zip16<8>(outRows[21], outRows[29], outRows[21], outRows[29]);
852 zip16<8>(outRows[19], outRows[27], outRows[19], outRows[27]);
853 zip16<8>(outRows[23], outRows[31], outRows[23], outRows[31]);
854 zip16<8>(outRows[32], outRows[40], outRows[32], outRows[40]);
855 zip16<8>(outRows[36], outRows[44], outRows[36], outRows[44]);
856 zip16<8>(outRows[34], outRows[42], outRows[34], outRows[42]);
857 zip16<8>(outRows[38], outRows[46], outRows[38], outRows[46]);
858 zip16<8>(outRows[33], outRows[41], outRows[33], outRows[41]);
859 zip16<8>(outRows[37], outRows[45], outRows[37], outRows[45]);
860 zip16<8>(outRows[35], outRows[43], outRows[35], outRows[43]);
861 zip16<8>(outRows[39], outRows[47], outRows[39], outRows[47]);
862 zip16<8>(outRows[48], outRows[56], outRows[48], outRows[56]);
863 zip16<8>(outRows[52], outRows[60], outRows[52], outRows[60]);
864 zip16<8>(outRows[50], outRows[58], outRows[50], outRows[58]);
865 zip16<8>(outRows[54], outRows[62], outRows[54], outRows[62]);
866 zip16<8>(outRows[49], outRows[57], outRows[49], outRows[57]);
867 zip16<8>(outRows[53], outRows[61], outRows[53], outRows[61]);
868 zip16<8>(outRows[51], outRows[59], outRows[51], outRows[59]);
869 zip16<8>(outRows[55], outRows[63], outRows[55], outRows[63]);
    // row reordering: within each group of 16 rows the swapped index pairs
    // are each other's 4-bit bit reversal (1<->8, 2<->4, 3<->12, ...)
870 std::swap(outRows[1], outRows[8]);
871 std::swap(outRows[2], outRows[4]);
872 std::swap(outRows[3], outRows[12]);
873 std::swap(outRows[5], outRows[10]);
874 std::swap(outRows[7], outRows[14]);
875 std::swap(outRows[11], outRows[13]);
876 std::swap(outRows[17], outRows[24]);
877 std::swap(outRows[18], outRows[20]);
878 std::swap(outRows[19], outRows[28]);
879 std::swap(outRows[21], outRows[26]);
880 std::swap(outRows[23], outRows[30]);
881 std::swap(outRows[27], outRows[29]);
882 std::swap(outRows[33], outRows[40]);
883 std::swap(outRows[34], outRows[36]);
884 std::swap(outRows[35], outRows[44]);
885 std::swap(outRows[37], outRows[42]);
886 std::swap(outRows[39], outRows[46]);
887 std::swap(outRows[43], outRows[45]);
888 std::swap(outRows[49], outRows[56]);
889 std::swap(outRows[50], outRows[52]);
890 std::swap(outRows[51], outRows[60]);
891 std::swap(outRows[53], outRows[58]);
892 std::swap(outRows[55], outRows[62]);
893 std::swap(outRows[59], outRows[61]);
894 // correction steps follow below (if required)
    // zip<16> (not zip16) in place: row i is paired with row i + 16, done
    // separately for rows 0..31 and rows 32..63
895 zip<16>(outRows[0], outRows[16], outRows[0], outRows[16]);
896 zip<16>(outRows[1], outRows[17], outRows[1], outRows[17]);
897 zip<16>(outRows[2], outRows[18], outRows[2], outRows[18]);
898 zip<16>(outRows[3], outRows[19], outRows[3], outRows[19]);
899 zip<16>(outRows[4], outRows[20], outRows[4], outRows[20]);
900 zip<16>(outRows[5], outRows[21], outRows[5], outRows[21]);
901 zip<16>(outRows[6], outRows[22], outRows[6], outRows[22]);
902 zip<16>(outRows[7], outRows[23], outRows[7], outRows[23]);
903 zip<16>(outRows[8], outRows[24], outRows[8], outRows[24]);
904 zip<16>(outRows[9], outRows[25], outRows[9], outRows[25]);
905 zip<16>(outRows[10], outRows[26], outRows[10], outRows[26]);
906 zip<16>(outRows[11], outRows[27], outRows[11], outRows[27]);
907 zip<16>(outRows[12], outRows[28], outRows[12], outRows[28]);
908 zip<16>(outRows[13], outRows[29], outRows[13], outRows[29]);
909 zip<16>(outRows[14], outRows[30], outRows[14], outRows[30]);
910 zip<16>(outRows[15], outRows[31], outRows[15], outRows[31]);
911 zip<16>(outRows[32], outRows[48], outRows[32], outRows[48]);
912 zip<16>(outRows[33], outRows[49], outRows[33], outRows[49]);
913 zip<16>(outRows[34], outRows[50], outRows[34], outRows[50]);
914 zip<16>(outRows[35], outRows[51], outRows[35], outRows[51]);
915 zip<16>(outRows[36], outRows[52], outRows[36], outRows[52]);
916 zip<16>(outRows[37], outRows[53], outRows[37], outRows[53]);
917 zip<16>(outRows[38], outRows[54], outRows[38], outRows[54]);
918 zip<16>(outRows[39], outRows[55], outRows[39], outRows[55]);
919 zip<16>(outRows[40], outRows[56], outRows[40], outRows[56]);
920 zip<16>(outRows[41], outRows[57], outRows[41], outRows[57]);
921 zip<16>(outRows[42], outRows[58], outRows[42], outRows[58]);
922 zip<16>(outRows[43], outRows[59], outRows[43], outRows[59]);
923 zip<16>(outRows[44], outRows[60], outRows[44], outRows[60]);
924 zip<16>(outRows[45], outRows[61], outRows[45], outRows[61]);
925 zip<16>(outRows[46], outRows[62], outRows[46], outRows[62]);
926 zip<16>(outRows[47], outRows[63], outRows[47], outRows[63]);
    // zip<32> in place: row i is paired with row i + 32
927 zip<32>(outRows[0], outRows[32], outRows[0], outRows[32]);
928 zip<32>(outRows[1], outRows[33], outRows[1], outRows[33]);
929 zip<32>(outRows[2], outRows[34], outRows[2], outRows[34]);
930 zip<32>(outRows[3], outRows[35], outRows[3], outRows[35]);
931 zip<32>(outRows[4], outRows[36], outRows[4], outRows[36]);
932 zip<32>(outRows[5], outRows[37], outRows[5], outRows[37]);
933 zip<32>(outRows[6], outRows[38], outRows[6], outRows[38]);
934 zip<32>(outRows[7], outRows[39], outRows[7], outRows[39]);
935 zip<32>(outRows[8], outRows[40], outRows[8], outRows[40]);
936 zip<32>(outRows[9], outRows[41], outRows[9], outRows[41]);
937 zip<32>(outRows[10], outRows[42], outRows[10], outRows[42]);
938 zip<32>(outRows[11], outRows[43], outRows[11], outRows[43]);
939 zip<32>(outRows[12], outRows[44], outRows[12], outRows[44]);
940 zip<32>(outRows[13], outRows[45], outRows[13], outRows[45]);
941 zip<32>(outRows[14], outRows[46], outRows[14], outRows[46]);
942 zip<32>(outRows[15], outRows[47], outRows[15], outRows[47]);
943 zip<32>(outRows[16], outRows[48], outRows[16], outRows[48]);
944 zip<32>(outRows[17], outRows[49], outRows[17], outRows[49]);
945 zip<32>(outRows[18], outRows[50], outRows[18], outRows[50]);
946 zip<32>(outRows[19], outRows[51], outRows[19], outRows[51]);
947 zip<32>(outRows[20], outRows[52], outRows[20], outRows[52]);
948 zip<32>(outRows[21], outRows[53], outRows[21], outRows[53]);
949 zip<32>(outRows[22], outRows[54], outRows[22], outRows[54]);
950 zip<32>(outRows[23], outRows[55], outRows[23], outRows[55]);
951 zip<32>(outRows[24], outRows[56], outRows[24], outRows[56]);
952 zip<32>(outRows[25], outRows[57], outRows[25], outRows[57]);
953 zip<32>(outRows[26], outRows[58], outRows[26], outRows[58]);
954 zip<32>(outRows[27], outRows[59], outRows[27], outRows[59]);
955 zip<32>(outRows[28], outRows[60], outRows[28], outRows[60]);
956 zip<32>(outRows[29], outRows[61], outRows[29], outRows[61]);
957 zip<32>(outRows[30], outRows[62], outRows[30], outRows[62]);
958 zip<32>(outRows[31], outRows[63], outRows[31], outRows[63]);
    // final reordering: exchange rows 16..31 with rows 32..47
959 std::swap(outRows[16], outRows[32]);
960 std::swap(outRows[17], outRows[33]);
961 std::swap(outRows[18], outRows[34]);
962 std::swap(outRows[19], outRows[35]);
963 std::swap(outRows[20], outRows[36]);
964 std::swap(outRows[21], outRows[37]);
965 std::swap(outRows[22], outRows[38]);
966 std::swap(outRows[23], outRows[39]);
967 std::swap(outRows[24], outRows[40]);
968 std::swap(outRows[25], outRows[41]);
969 std::swap(outRows[26], outRows[42]);
970 std::swap(outRows[27], outRows[43]);
971 std::swap(outRows[28], outRows[44]);
972 std::swap(outRows[29], outRows[45]);
973 std::swap(outRows[30], outRows[46]);
974 std::swap(outRows[31], outRows[47]);
975}
976
// transpose1inplcLane overload selected by the tags Elements<32>, Bytes<64>.
// inRows is only read in the first stage; every later step works in place on
// outRows. Call schedule, exactly as written below:
//   1. zip16<1> on adjacent row pairs (2i, 2i+1), inRows -> outRows
//   2. zip16<2> and zip16<4> in place, pairing rows at distance 2 and 4
//   3. std::swap of rows inside each group of 8 rows
//   4. zip<8> (row i with row i + 8 inside each group of 16 rows), then
//      zip<16> (row i with row i + 16)
//   5. std::swap exchanging rows 8..15 with rows 16..23
// zip16 and zip are declared outside this chunk (presumably via ../base.H);
// judging by the names, zip16 presumably interleaves within 16-byte lanes and
// zip across the whole vector - TODO confirm against their definitions.
// NOTE(review): from stage 2 on the same rows are passed as inputs and as
// outputs, so this relies on zip16/zip tolerating aliased arguments - confirm.
977template <typename T, size_t SIMD_WIDTH>
978static SIMD_INLINE void transpose1inplcLane(
979 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
980 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>,
981 Bytes<64>)
982{
    // stage 1: zip16<1> on adjacent input rows, results go to outRows
983 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
984 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
985 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
986 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
987 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
988 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
989 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
990 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
991 zip16<1>(inRows[16], inRows[17], outRows[16], outRows[17]);
992 zip16<1>(inRows[18], inRows[19], outRows[18], outRows[19]);
993 zip16<1>(inRows[20], inRows[21], outRows[20], outRows[21]);
994 zip16<1>(inRows[22], inRows[23], outRows[22], outRows[23]);
995 zip16<1>(inRows[24], inRows[25], outRows[24], outRows[25]);
996 zip16<1>(inRows[26], inRows[27], outRows[26], outRows[27]);
997 zip16<1>(inRows[28], inRows[29], outRows[28], outRows[29]);
998 zip16<1>(inRows[30], inRows[31], outRows[30], outRows[31]);
    // stage 2: zip16<2> in place, rows at distance 2 within each group of 4
999 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
1000 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
1001 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
1002 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
1003 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
1004 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
1005 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
1006 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
1007 zip16<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
1008 zip16<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
1009 zip16<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
1010 zip16<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
1011 zip16<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
1012 zip16<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
1013 zip16<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
1014 zip16<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
    // stage 3: zip16<4> in place, rows at distance 4 within each group of 8
1015 zip16<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
1016 zip16<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
1017 zip16<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
1018 zip16<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
1019 zip16<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
1020 zip16<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
1021 zip16<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
1022 zip16<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
1023 zip16<4>(outRows[16], outRows[20], outRows[16], outRows[20]);
1024 zip16<4>(outRows[18], outRows[22], outRows[18], outRows[22]);
1025 zip16<4>(outRows[17], outRows[21], outRows[17], outRows[21]);
1026 zip16<4>(outRows[19], outRows[23], outRows[19], outRows[23]);
1027 zip16<4>(outRows[24], outRows[28], outRows[24], outRows[28]);
1028 zip16<4>(outRows[26], outRows[30], outRows[26], outRows[30]);
1029 zip16<4>(outRows[25], outRows[29], outRows[25], outRows[29]);
1030 zip16<4>(outRows[27], outRows[31], outRows[27], outRows[31]);
    // row reordering: within each group of 8 rows the swapped index pairs
    // are each other's 3-bit bit reversal (1<->4, 3<->6)
1031 std::swap(outRows[1], outRows[4]);
1032 std::swap(outRows[3], outRows[6]);
1033 std::swap(outRows[9], outRows[12]);
1034 std::swap(outRows[11], outRows[14]);
1035 std::swap(outRows[17], outRows[20]);
1036 std::swap(outRows[19], outRows[22]);
1037 std::swap(outRows[25], outRows[28]);
1038 std::swap(outRows[27], outRows[30]);
1039 // correction steps follow below (if required)
    // zip<8> (not zip16) in place: row i is paired with row i + 8, done
    // separately for rows 0..15 and rows 16..31
1040 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
1041 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
1042 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
1043 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
1044 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
1045 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
1046 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
1047 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
1048 zip<8>(outRows[16], outRows[24], outRows[16], outRows[24]);
1049 zip<8>(outRows[17], outRows[25], outRows[17], outRows[25]);
1050 zip<8>(outRows[18], outRows[26], outRows[18], outRows[26]);
1051 zip<8>(outRows[19], outRows[27], outRows[19], outRows[27]);
1052 zip<8>(outRows[20], outRows[28], outRows[20], outRows[28]);
1053 zip<8>(outRows[21], outRows[29], outRows[21], outRows[29]);
1054 zip<8>(outRows[22], outRows[30], outRows[22], outRows[30]);
1055 zip<8>(outRows[23], outRows[31], outRows[23], outRows[31]);
    // zip<16> in place: row i is paired with row i + 16
1056 zip<16>(outRows[0], outRows[16], outRows[0], outRows[16]);
1057 zip<16>(outRows[1], outRows[17], outRows[1], outRows[17]);
1058 zip<16>(outRows[2], outRows[18], outRows[2], outRows[18]);
1059 zip<16>(outRows[3], outRows[19], outRows[3], outRows[19]);
1060 zip<16>(outRows[4], outRows[20], outRows[4], outRows[20]);
1061 zip<16>(outRows[5], outRows[21], outRows[5], outRows[21]);
1062 zip<16>(outRows[6], outRows[22], outRows[6], outRows[22]);
1063 zip<16>(outRows[7], outRows[23], outRows[7], outRows[23]);
1064 zip<16>(outRows[8], outRows[24], outRows[8], outRows[24]);
1065 zip<16>(outRows[9], outRows[25], outRows[9], outRows[25]);
1066 zip<16>(outRows[10], outRows[26], outRows[10], outRows[26]);
1067 zip<16>(outRows[11], outRows[27], outRows[11], outRows[27]);
1068 zip<16>(outRows[12], outRows[28], outRows[12], outRows[28]);
1069 zip<16>(outRows[13], outRows[29], outRows[13], outRows[29]);
1070 zip<16>(outRows[14], outRows[30], outRows[14], outRows[30]);
1071 zip<16>(outRows[15], outRows[31], outRows[15], outRows[31]);
    // final reordering: exchange rows 8..15 with rows 16..23
1072 std::swap(outRows[8], outRows[16]);
1073 std::swap(outRows[9], outRows[17]);
1074 std::swap(outRows[10], outRows[18]);
1075 std::swap(outRows[11], outRows[19]);
1076 std::swap(outRows[12], outRows[20]);
1077 std::swap(outRows[13], outRows[21]);
1078 std::swap(outRows[14], outRows[22]);
1079 std::swap(outRows[15], outRows[23]);
1080}
1081
// transpose1inplcLane overload selected by the tags Elements<16>, Bytes<64>.
// inRows is only read in the first stage; every later step works in place on
// outRows. Call schedule, exactly as written below:
//   1. zip16<1> on adjacent row pairs (2i, 2i+1), inRows -> outRows
//   2. zip16<2> in place, pairing rows at distance 2
//   3. std::swap of the two middle rows of each group of 4 rows
//   4. zip<4> (row i with row i + 4 inside each group of 8 rows), then
//      zip<8> (row i with row i + 8)
//   5. std::swap exchanging rows 4..7 with rows 8..11
// zip16 and zip are declared outside this chunk (presumably via ../base.H);
// judging by the names, zip16 presumably interleaves within 16-byte lanes and
// zip across the whole vector - TODO confirm against their definitions.
// NOTE(review): from stage 2 on the same rows are passed as inputs and as
// outputs, so this relies on zip16/zip tolerating aliased arguments - confirm.
1082template <typename T, size_t SIMD_WIDTH>
1083static SIMD_INLINE void transpose1inplcLane(
1084 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1085 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
1086 Bytes<64>)
1087{
    // stage 1: zip16<1> on adjacent input rows, results go to outRows
1088 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
1089 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
1090 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
1091 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
1092 zip16<1>(inRows[8], inRows[9], outRows[8], outRows[9]);
1093 zip16<1>(inRows[10], inRows[11], outRows[10], outRows[11]);
1094 zip16<1>(inRows[12], inRows[13], outRows[12], outRows[13]);
1095 zip16<1>(inRows[14], inRows[15], outRows[14], outRows[15]);
    // stage 2: zip16<2> in place, rows at distance 2 within each group of 4
1096 zip16<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
1097 zip16<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
1098 zip16<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
1099 zip16<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
1100 zip16<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
1101 zip16<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
1102 zip16<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
1103 zip16<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
    // row reordering: exchange the two middle rows of each group of 4
1104 std::swap(outRows[1], outRows[2]);
1105 std::swap(outRows[5], outRows[6]);
1106 std::swap(outRows[9], outRows[10]);
1107 std::swap(outRows[13], outRows[14]);
1108 // correction steps follow below (if required)
    // zip<4> (not zip16) in place: row i is paired with row i + 4, done
    // separately for rows 0..7 and rows 8..15
1109 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
1110 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
1111 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
1112 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
1113 zip<4>(outRows[8], outRows[12], outRows[8], outRows[12]);
1114 zip<4>(outRows[9], outRows[13], outRows[9], outRows[13]);
1115 zip<4>(outRows[10], outRows[14], outRows[10], outRows[14]);
1116 zip<4>(outRows[11], outRows[15], outRows[11], outRows[15]);
    // zip<8> in place: row i is paired with row i + 8
1117 zip<8>(outRows[0], outRows[8], outRows[0], outRows[8]);
1118 zip<8>(outRows[1], outRows[9], outRows[1], outRows[9]);
1119 zip<8>(outRows[2], outRows[10], outRows[2], outRows[10]);
1120 zip<8>(outRows[3], outRows[11], outRows[3], outRows[11]);
1121 zip<8>(outRows[4], outRows[12], outRows[4], outRows[12]);
1122 zip<8>(outRows[5], outRows[13], outRows[5], outRows[13]);
1123 zip<8>(outRows[6], outRows[14], outRows[6], outRows[14]);
1124 zip<8>(outRows[7], outRows[15], outRows[7], outRows[15]);
    // final reordering: exchange rows 4..7 with rows 8..11
1125 std::swap(outRows[4], outRows[8]);
1126 std::swap(outRows[5], outRows[9]);
1127 std::swap(outRows[6], outRows[10]);
1128 std::swap(outRows[7], outRows[11]);
1129}
1130
1131template <typename T, size_t SIMD_WIDTH>
1132static SIMD_INLINE void transpose1inplcLane(
1133 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1134 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<64>)
1135{
1136 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
1137 zip16<1>(inRows[2], inRows[3], outRows[2], outRows[3]);
1138 zip16<1>(inRows[4], inRows[5], outRows[4], outRows[5]);
1139 zip16<1>(inRows[6], inRows[7], outRows[6], outRows[7]);
1140 // correction steps follow below (if required)
1141 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
1142 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
1143 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
1144 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
1145 zip<4>(outRows[0], outRows[4], outRows[0], outRows[4]);
1146 zip<4>(outRows[1], outRows[5], outRows[1], outRows[5]);
1147 zip<4>(outRows[2], outRows[6], outRows[2], outRows[6]);
1148 zip<4>(outRows[3], outRows[7], outRows[3], outRows[7]);
1149 std::swap(outRows[2], outRows[4]);
1150 std::swap(outRows[3], outRows[5]);
1151}
1152
1153template <typename T, size_t SIMD_WIDTH>
1154static SIMD_INLINE void transpose1inplcLane(
1155 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1156 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
1157{
1158 transpose1inplcLane(inRows, outRows, Elements<Vec<T, SIMD_WIDTH>::elements>(),
1159 Bytes<SIMD_WIDTH>());
1160}
1161
1162// ==========================================================
1163// transpose2inplc
1164// ==========================================================
1165
1166template <typename T, size_t SIMD_WIDTH>
1167static SIMD_INLINE void transpose2inplc(
1168 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1169 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<2>)
1170{
1171 zip<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
1172}
1173
1174template <typename T, size_t SIMD_WIDTH>
1175static SIMD_INLINE void transpose2inplc(
1176 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1177 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>)
1178{
1179 zip<1>(inRows[0], inRows[2], outRows[0], outRows[2]);
1180 zip<1>(inRows[1], inRows[3], outRows[1], outRows[3]);
1181 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1182 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1183}
1184
1185template <typename T, size_t SIMD_WIDTH>
1186static SIMD_INLINE void transpose2inplc(
1187 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1188 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>)
1189{
1190 zip<1>(inRows[0], inRows[4], outRows[0], outRows[4]);
1191 zip<1>(inRows[1], inRows[5], outRows[1], outRows[5]);
1192 zip<1>(inRows[2], inRows[6], outRows[2], outRows[6]);
1193 zip<1>(inRows[3], inRows[7], outRows[3], outRows[7]);
1194 zip<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1195 zip<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1196 zip<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1197 zip<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1198 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1199 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1200 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1201 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1202}
1203
1204template <typename T, size_t SIMD_WIDTH>
1205static SIMD_INLINE void transpose2inplc(
1206 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1207 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>)
1208{
1209 zip<1>(inRows[0], inRows[8], outRows[0], outRows[8]);
1210 zip<1>(inRows[1], inRows[9], outRows[1], outRows[9]);
1211 zip<1>(inRows[2], inRows[10], outRows[2], outRows[10]);
1212 zip<1>(inRows[3], inRows[11], outRows[3], outRows[11]);
1213 zip<1>(inRows[4], inRows[12], outRows[4], outRows[12]);
1214 zip<1>(inRows[5], inRows[13], outRows[5], outRows[13]);
1215 zip<1>(inRows[6], inRows[14], outRows[6], outRows[14]);
1216 zip<1>(inRows[7], inRows[15], outRows[7], outRows[15]);
1217 zip<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1218 zip<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1219 zip<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1220 zip<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1221 zip<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1222 zip<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1223 zip<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1224 zip<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1225 zip<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1226 zip<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1227 zip<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1228 zip<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1229 zip<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1230 zip<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1231 zip<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1232 zip<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1233 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1234 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1235 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1236 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1237 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1238 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1239 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1240 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1241}
1242
1243template <typename T, size_t SIMD_WIDTH>
1244static SIMD_INLINE void transpose2inplc(
1245 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1246 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>)
1247{
1248 zip<1>(inRows[0], inRows[16], outRows[0], outRows[16]);
1249 zip<1>(inRows[1], inRows[17], outRows[1], outRows[17]);
1250 zip<1>(inRows[2], inRows[18], outRows[2], outRows[18]);
1251 zip<1>(inRows[3], inRows[19], outRows[3], outRows[19]);
1252 zip<1>(inRows[4], inRows[20], outRows[4], outRows[20]);
1253 zip<1>(inRows[5], inRows[21], outRows[5], outRows[21]);
1254 zip<1>(inRows[6], inRows[22], outRows[6], outRows[22]);
1255 zip<1>(inRows[7], inRows[23], outRows[7], outRows[23]);
1256 zip<1>(inRows[8], inRows[24], outRows[8], outRows[24]);
1257 zip<1>(inRows[9], inRows[25], outRows[9], outRows[25]);
1258 zip<1>(inRows[10], inRows[26], outRows[10], outRows[26]);
1259 zip<1>(inRows[11], inRows[27], outRows[11], outRows[27]);
1260 zip<1>(inRows[12], inRows[28], outRows[12], outRows[28]);
1261 zip<1>(inRows[13], inRows[29], outRows[13], outRows[29]);
1262 zip<1>(inRows[14], inRows[30], outRows[14], outRows[30]);
1263 zip<1>(inRows[15], inRows[31], outRows[15], outRows[31]);
1264 zip<1>(outRows[0], outRows[8], outRows[0], outRows[8]);
1265 zip<1>(outRows[16], outRows[24], outRows[16], outRows[24]);
1266 zip<1>(outRows[1], outRows[9], outRows[1], outRows[9]);
1267 zip<1>(outRows[17], outRows[25], outRows[17], outRows[25]);
1268 zip<1>(outRows[2], outRows[10], outRows[2], outRows[10]);
1269 zip<1>(outRows[18], outRows[26], outRows[18], outRows[26]);
1270 zip<1>(outRows[3], outRows[11], outRows[3], outRows[11]);
1271 zip<1>(outRows[19], outRows[27], outRows[19], outRows[27]);
1272 zip<1>(outRows[4], outRows[12], outRows[4], outRows[12]);
1273 zip<1>(outRows[20], outRows[28], outRows[20], outRows[28]);
1274 zip<1>(outRows[5], outRows[13], outRows[5], outRows[13]);
1275 zip<1>(outRows[21], outRows[29], outRows[21], outRows[29]);
1276 zip<1>(outRows[6], outRows[14], outRows[6], outRows[14]);
1277 zip<1>(outRows[22], outRows[30], outRows[22], outRows[30]);
1278 zip<1>(outRows[7], outRows[15], outRows[7], outRows[15]);
1279 zip<1>(outRows[23], outRows[31], outRows[23], outRows[31]);
1280 zip<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1281 zip<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1282 zip<1>(outRows[16], outRows[20], outRows[16], outRows[20]);
1283 zip<1>(outRows[24], outRows[28], outRows[24], outRows[28]);
1284 zip<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1285 zip<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1286 zip<1>(outRows[17], outRows[21], outRows[17], outRows[21]);
1287 zip<1>(outRows[25], outRows[29], outRows[25], outRows[29]);
1288 zip<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1289 zip<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1290 zip<1>(outRows[18], outRows[22], outRows[18], outRows[22]);
1291 zip<1>(outRows[26], outRows[30], outRows[26], outRows[30]);
1292 zip<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1293 zip<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1294 zip<1>(outRows[19], outRows[23], outRows[19], outRows[23]);
1295 zip<1>(outRows[27], outRows[31], outRows[27], outRows[31]);
1296 zip<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1297 zip<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1298 zip<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1299 zip<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1300 zip<1>(outRows[16], outRows[18], outRows[16], outRows[18]);
1301 zip<1>(outRows[20], outRows[22], outRows[20], outRows[22]);
1302 zip<1>(outRows[24], outRows[26], outRows[24], outRows[26]);
1303 zip<1>(outRows[28], outRows[30], outRows[28], outRows[30]);
1304 zip<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1305 zip<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1306 zip<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1307 zip<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1308 zip<1>(outRows[17], outRows[19], outRows[17], outRows[19]);
1309 zip<1>(outRows[21], outRows[23], outRows[21], outRows[23]);
1310 zip<1>(outRows[25], outRows[27], outRows[25], outRows[27]);
1311 zip<1>(outRows[29], outRows[31], outRows[29], outRows[31]);
1312 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1313 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1314 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1315 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1316 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1317 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1318 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1319 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1320 zip<1>(outRows[16], outRows[17], outRows[16], outRows[17]);
1321 zip<1>(outRows[18], outRows[19], outRows[18], outRows[19]);
1322 zip<1>(outRows[20], outRows[21], outRows[20], outRows[21]);
1323 zip<1>(outRows[22], outRows[23], outRows[22], outRows[23]);
1324 zip<1>(outRows[24], outRows[25], outRows[24], outRows[25]);
1325 zip<1>(outRows[26], outRows[27], outRows[26], outRows[27]);
1326 zip<1>(outRows[28], outRows[29], outRows[28], outRows[29]);
1327 zip<1>(outRows[30], outRows[31], outRows[30], outRows[31]);
1328}
1329
1330template <typename T, size_t SIMD_WIDTH>
1331static SIMD_INLINE void transpose2inplc(
1332 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1333 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<64>)
1334{
1335 zip<1>(inRows[0], inRows[32], outRows[0], outRows[32]);
1336 zip<1>(inRows[1], inRows[33], outRows[1], outRows[33]);
1337 zip<1>(inRows[2], inRows[34], outRows[2], outRows[34]);
1338 zip<1>(inRows[3], inRows[35], outRows[3], outRows[35]);
1339 zip<1>(inRows[4], inRows[36], outRows[4], outRows[36]);
1340 zip<1>(inRows[5], inRows[37], outRows[5], outRows[37]);
1341 zip<1>(inRows[6], inRows[38], outRows[6], outRows[38]);
1342 zip<1>(inRows[7], inRows[39], outRows[7], outRows[39]);
1343 zip<1>(inRows[8], inRows[40], outRows[8], outRows[40]);
1344 zip<1>(inRows[9], inRows[41], outRows[9], outRows[41]);
1345 zip<1>(inRows[10], inRows[42], outRows[10], outRows[42]);
1346 zip<1>(inRows[11], inRows[43], outRows[11], outRows[43]);
1347 zip<1>(inRows[12], inRows[44], outRows[12], outRows[44]);
1348 zip<1>(inRows[13], inRows[45], outRows[13], outRows[45]);
1349 zip<1>(inRows[14], inRows[46], outRows[14], outRows[46]);
1350 zip<1>(inRows[15], inRows[47], outRows[15], outRows[47]);
1351 zip<1>(inRows[16], inRows[48], outRows[16], outRows[48]);
1352 zip<1>(inRows[17], inRows[49], outRows[17], outRows[49]);
1353 zip<1>(inRows[18], inRows[50], outRows[18], outRows[50]);
1354 zip<1>(inRows[19], inRows[51], outRows[19], outRows[51]);
1355 zip<1>(inRows[20], inRows[52], outRows[20], outRows[52]);
1356 zip<1>(inRows[21], inRows[53], outRows[21], outRows[53]);
1357 zip<1>(inRows[22], inRows[54], outRows[22], outRows[54]);
1358 zip<1>(inRows[23], inRows[55], outRows[23], outRows[55]);
1359 zip<1>(inRows[24], inRows[56], outRows[24], outRows[56]);
1360 zip<1>(inRows[25], inRows[57], outRows[25], outRows[57]);
1361 zip<1>(inRows[26], inRows[58], outRows[26], outRows[58]);
1362 zip<1>(inRows[27], inRows[59], outRows[27], outRows[59]);
1363 zip<1>(inRows[28], inRows[60], outRows[28], outRows[60]);
1364 zip<1>(inRows[29], inRows[61], outRows[29], outRows[61]);
1365 zip<1>(inRows[30], inRows[62], outRows[30], outRows[62]);
1366 zip<1>(inRows[31], inRows[63], outRows[31], outRows[63]);
1367 zip<1>(outRows[0], outRows[16], outRows[0], outRows[16]);
1368 zip<1>(outRows[32], outRows[48], outRows[32], outRows[48]);
1369 zip<1>(outRows[1], outRows[17], outRows[1], outRows[17]);
1370 zip<1>(outRows[33], outRows[49], outRows[33], outRows[49]);
1371 zip<1>(outRows[2], outRows[18], outRows[2], outRows[18]);
1372 zip<1>(outRows[34], outRows[50], outRows[34], outRows[50]);
1373 zip<1>(outRows[3], outRows[19], outRows[3], outRows[19]);
1374 zip<1>(outRows[35], outRows[51], outRows[35], outRows[51]);
1375 zip<1>(outRows[4], outRows[20], outRows[4], outRows[20]);
1376 zip<1>(outRows[36], outRows[52], outRows[36], outRows[52]);
1377 zip<1>(outRows[5], outRows[21], outRows[5], outRows[21]);
1378 zip<1>(outRows[37], outRows[53], outRows[37], outRows[53]);
1379 zip<1>(outRows[6], outRows[22], outRows[6], outRows[22]);
1380 zip<1>(outRows[38], outRows[54], outRows[38], outRows[54]);
1381 zip<1>(outRows[7], outRows[23], outRows[7], outRows[23]);
1382 zip<1>(outRows[39], outRows[55], outRows[39], outRows[55]);
1383 zip<1>(outRows[8], outRows[24], outRows[8], outRows[24]);
1384 zip<1>(outRows[40], outRows[56], outRows[40], outRows[56]);
1385 zip<1>(outRows[9], outRows[25], outRows[9], outRows[25]);
1386 zip<1>(outRows[41], outRows[57], outRows[41], outRows[57]);
1387 zip<1>(outRows[10], outRows[26], outRows[10], outRows[26]);
1388 zip<1>(outRows[42], outRows[58], outRows[42], outRows[58]);
1389 zip<1>(outRows[11], outRows[27], outRows[11], outRows[27]);
1390 zip<1>(outRows[43], outRows[59], outRows[43], outRows[59]);
1391 zip<1>(outRows[12], outRows[28], outRows[12], outRows[28]);
1392 zip<1>(outRows[44], outRows[60], outRows[44], outRows[60]);
1393 zip<1>(outRows[13], outRows[29], outRows[13], outRows[29]);
1394 zip<1>(outRows[45], outRows[61], outRows[45], outRows[61]);
1395 zip<1>(outRows[14], outRows[30], outRows[14], outRows[30]);
1396 zip<1>(outRows[46], outRows[62], outRows[46], outRows[62]);
1397 zip<1>(outRows[15], outRows[31], outRows[15], outRows[31]);
1398 zip<1>(outRows[47], outRows[63], outRows[47], outRows[63]);
1399 zip<1>(outRows[0], outRows[8], outRows[0], outRows[8]);
1400 zip<1>(outRows[16], outRows[24], outRows[16], outRows[24]);
1401 zip<1>(outRows[32], outRows[40], outRows[32], outRows[40]);
1402 zip<1>(outRows[48], outRows[56], outRows[48], outRows[56]);
1403 zip<1>(outRows[1], outRows[9], outRows[1], outRows[9]);
1404 zip<1>(outRows[17], outRows[25], outRows[17], outRows[25]);
1405 zip<1>(outRows[33], outRows[41], outRows[33], outRows[41]);
1406 zip<1>(outRows[49], outRows[57], outRows[49], outRows[57]);
1407 zip<1>(outRows[2], outRows[10], outRows[2], outRows[10]);
1408 zip<1>(outRows[18], outRows[26], outRows[18], outRows[26]);
1409 zip<1>(outRows[34], outRows[42], outRows[34], outRows[42]);
1410 zip<1>(outRows[50], outRows[58], outRows[50], outRows[58]);
1411 zip<1>(outRows[3], outRows[11], outRows[3], outRows[11]);
1412 zip<1>(outRows[19], outRows[27], outRows[19], outRows[27]);
1413 zip<1>(outRows[35], outRows[43], outRows[35], outRows[43]);
1414 zip<1>(outRows[51], outRows[59], outRows[51], outRows[59]);
1415 zip<1>(outRows[4], outRows[12], outRows[4], outRows[12]);
1416 zip<1>(outRows[20], outRows[28], outRows[20], outRows[28]);
1417 zip<1>(outRows[36], outRows[44], outRows[36], outRows[44]);
1418 zip<1>(outRows[52], outRows[60], outRows[52], outRows[60]);
1419 zip<1>(outRows[5], outRows[13], outRows[5], outRows[13]);
1420 zip<1>(outRows[21], outRows[29], outRows[21], outRows[29]);
1421 zip<1>(outRows[37], outRows[45], outRows[37], outRows[45]);
1422 zip<1>(outRows[53], outRows[61], outRows[53], outRows[61]);
1423 zip<1>(outRows[6], outRows[14], outRows[6], outRows[14]);
1424 zip<1>(outRows[22], outRows[30], outRows[22], outRows[30]);
1425 zip<1>(outRows[38], outRows[46], outRows[38], outRows[46]);
1426 zip<1>(outRows[54], outRows[62], outRows[54], outRows[62]);
1427 zip<1>(outRows[7], outRows[15], outRows[7], outRows[15]);
1428 zip<1>(outRows[23], outRows[31], outRows[23], outRows[31]);
1429 zip<1>(outRows[39], outRows[47], outRows[39], outRows[47]);
1430 zip<1>(outRows[55], outRows[63], outRows[55], outRows[63]);
1431 zip<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1432 zip<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1433 zip<1>(outRows[16], outRows[20], outRows[16], outRows[20]);
1434 zip<1>(outRows[24], outRows[28], outRows[24], outRows[28]);
1435 zip<1>(outRows[32], outRows[36], outRows[32], outRows[36]);
1436 zip<1>(outRows[40], outRows[44], outRows[40], outRows[44]);
1437 zip<1>(outRows[48], outRows[52], outRows[48], outRows[52]);
1438 zip<1>(outRows[56], outRows[60], outRows[56], outRows[60]);
1439 zip<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1440 zip<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1441 zip<1>(outRows[17], outRows[21], outRows[17], outRows[21]);
1442 zip<1>(outRows[25], outRows[29], outRows[25], outRows[29]);
1443 zip<1>(outRows[33], outRows[37], outRows[33], outRows[37]);
1444 zip<1>(outRows[41], outRows[45], outRows[41], outRows[45]);
1445 zip<1>(outRows[49], outRows[53], outRows[49], outRows[53]);
1446 zip<1>(outRows[57], outRows[61], outRows[57], outRows[61]);
1447 zip<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1448 zip<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1449 zip<1>(outRows[18], outRows[22], outRows[18], outRows[22]);
1450 zip<1>(outRows[26], outRows[30], outRows[26], outRows[30]);
1451 zip<1>(outRows[34], outRows[38], outRows[34], outRows[38]);
1452 zip<1>(outRows[42], outRows[46], outRows[42], outRows[46]);
1453 zip<1>(outRows[50], outRows[54], outRows[50], outRows[54]);
1454 zip<1>(outRows[58], outRows[62], outRows[58], outRows[62]);
1455 zip<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1456 zip<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1457 zip<1>(outRows[19], outRows[23], outRows[19], outRows[23]);
1458 zip<1>(outRows[27], outRows[31], outRows[27], outRows[31]);
1459 zip<1>(outRows[35], outRows[39], outRows[35], outRows[39]);
1460 zip<1>(outRows[43], outRows[47], outRows[43], outRows[47]);
1461 zip<1>(outRows[51], outRows[55], outRows[51], outRows[55]);
1462 zip<1>(outRows[59], outRows[63], outRows[59], outRows[63]);
1463 zip<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1464 zip<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1465 zip<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1466 zip<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1467 zip<1>(outRows[16], outRows[18], outRows[16], outRows[18]);
1468 zip<1>(outRows[20], outRows[22], outRows[20], outRows[22]);
1469 zip<1>(outRows[24], outRows[26], outRows[24], outRows[26]);
1470 zip<1>(outRows[28], outRows[30], outRows[28], outRows[30]);
1471 zip<1>(outRows[32], outRows[34], outRows[32], outRows[34]);
1472 zip<1>(outRows[36], outRows[38], outRows[36], outRows[38]);
1473 zip<1>(outRows[40], outRows[42], outRows[40], outRows[42]);
1474 zip<1>(outRows[44], outRows[46], outRows[44], outRows[46]);
1475 zip<1>(outRows[48], outRows[50], outRows[48], outRows[50]);
1476 zip<1>(outRows[52], outRows[54], outRows[52], outRows[54]);
1477 zip<1>(outRows[56], outRows[58], outRows[56], outRows[58]);
1478 zip<1>(outRows[60], outRows[62], outRows[60], outRows[62]);
1479 zip<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1480 zip<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1481 zip<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1482 zip<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1483 zip<1>(outRows[17], outRows[19], outRows[17], outRows[19]);
1484 zip<1>(outRows[21], outRows[23], outRows[21], outRows[23]);
1485 zip<1>(outRows[25], outRows[27], outRows[25], outRows[27]);
1486 zip<1>(outRows[29], outRows[31], outRows[29], outRows[31]);
1487 zip<1>(outRows[33], outRows[35], outRows[33], outRows[35]);
1488 zip<1>(outRows[37], outRows[39], outRows[37], outRows[39]);
1489 zip<1>(outRows[41], outRows[43], outRows[41], outRows[43]);
1490 zip<1>(outRows[45], outRows[47], outRows[45], outRows[47]);
1491 zip<1>(outRows[49], outRows[51], outRows[49], outRows[51]);
1492 zip<1>(outRows[53], outRows[55], outRows[53], outRows[55]);
1493 zip<1>(outRows[57], outRows[59], outRows[57], outRows[59]);
1494 zip<1>(outRows[61], outRows[63], outRows[61], outRows[63]);
1495 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1496 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1497 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1498 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1499 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1500 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1501 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1502 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1503 zip<1>(outRows[16], outRows[17], outRows[16], outRows[17]);
1504 zip<1>(outRows[18], outRows[19], outRows[18], outRows[19]);
1505 zip<1>(outRows[20], outRows[21], outRows[20], outRows[21]);
1506 zip<1>(outRows[22], outRows[23], outRows[22], outRows[23]);
1507 zip<1>(outRows[24], outRows[25], outRows[24], outRows[25]);
1508 zip<1>(outRows[26], outRows[27], outRows[26], outRows[27]);
1509 zip<1>(outRows[28], outRows[29], outRows[28], outRows[29]);
1510 zip<1>(outRows[30], outRows[31], outRows[30], outRows[31]);
1511 zip<1>(outRows[32], outRows[33], outRows[32], outRows[33]);
1512 zip<1>(outRows[34], outRows[35], outRows[34], outRows[35]);
1513 zip<1>(outRows[36], outRows[37], outRows[36], outRows[37]);
1514 zip<1>(outRows[38], outRows[39], outRows[38], outRows[39]);
1515 zip<1>(outRows[40], outRows[41], outRows[40], outRows[41]);
1516 zip<1>(outRows[42], outRows[43], outRows[42], outRows[43]);
1517 zip<1>(outRows[44], outRows[45], outRows[44], outRows[45]);
1518 zip<1>(outRows[46], outRows[47], outRows[46], outRows[47]);
1519 zip<1>(outRows[48], outRows[49], outRows[48], outRows[49]);
1520 zip<1>(outRows[50], outRows[51], outRows[50], outRows[51]);
1521 zip<1>(outRows[52], outRows[53], outRows[52], outRows[53]);
1522 zip<1>(outRows[54], outRows[55], outRows[54], outRows[55]);
1523 zip<1>(outRows[56], outRows[57], outRows[56], outRows[57]);
1524 zip<1>(outRows[58], outRows[59], outRows[58], outRows[59]);
1525 zip<1>(outRows[60], outRows[61], outRows[60], outRows[61]);
1526 zip<1>(outRows[62], outRows[63], outRows[62], outRows[63]);
1527}
1528
1529template <typename T, size_t SIMD_WIDTH>
1530static SIMD_INLINE void transpose2inplc(
1531 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1532 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
1533{
1534 transpose2inplc(inRows, outRows, Elements<Vec<T, SIMD_WIDTH>::elements>());
1535}
1536
1537// ==========================================================
1538// transpose2inplcLane
1539// ==========================================================
1540
1541template <typename T, size_t SIMD_WIDTH>
1542static SIMD_INLINE void transpose2inplcLane(
1543 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1544 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
1545 Bytes<16>)
1546{
1547 zip16<1>(inRows[0], inRows[8], outRows[0], outRows[8]);
1548 zip16<1>(inRows[1], inRows[9], outRows[1], outRows[9]);
1549 zip16<1>(inRows[2], inRows[10], outRows[2], outRows[10]);
1550 zip16<1>(inRows[3], inRows[11], outRows[3], outRows[11]);
1551 zip16<1>(inRows[4], inRows[12], outRows[4], outRows[12]);
1552 zip16<1>(inRows[5], inRows[13], outRows[5], outRows[13]);
1553 zip16<1>(inRows[6], inRows[14], outRows[6], outRows[14]);
1554 zip16<1>(inRows[7], inRows[15], outRows[7], outRows[15]);
1555 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1556 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1557 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1558 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1559 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1560 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1561 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1562 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1563 zip16<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1564 zip16<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1565 zip16<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1566 zip16<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1567 zip16<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1568 zip16<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1569 zip16<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1570 zip16<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1571 zip16<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1572 zip16<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1573 zip16<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1574 zip16<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1575 zip16<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1576 zip16<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1577 zip16<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1578 zip16<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1579 // correction steps follow below (if required)
1580}
1581
1582template <typename T, size_t SIMD_WIDTH>
1583static SIMD_INLINE void transpose2inplcLane(
1584 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1585 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<16>)
1586{
1587 zip16<1>(inRows[0], inRows[4], outRows[0], outRows[4]);
1588 zip16<1>(inRows[1], inRows[5], outRows[1], outRows[5]);
1589 zip16<1>(inRows[2], inRows[6], outRows[2], outRows[6]);
1590 zip16<1>(inRows[3], inRows[7], outRows[3], outRows[7]);
1591 zip16<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1592 zip16<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1593 zip16<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1594 zip16<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1595 zip16<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1596 zip16<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1597 zip16<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1598 zip16<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1599 // correction steps follow below (if required)
1600}
1601
1602template <typename T, size_t SIMD_WIDTH>
1603static SIMD_INLINE void transpose2inplcLane(
1604 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1605 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<16>)
1606{
1607 zip16<1>(inRows[0], inRows[2], outRows[0], outRows[2]);
1608 zip16<1>(inRows[1], inRows[3], outRows[1], outRows[3]);
1609 zip16<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1610 zip16<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1611 // correction steps follow below (if required)
1612}
1613
1614template <typename T, size_t SIMD_WIDTH>
1615static SIMD_INLINE void transpose2inplcLane(
1616 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1617 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<2>, Bytes<16>)
1618{
1619 zip16<1>(inRows[0], inRows[1], outRows[0], outRows[1]);
1620 // correction steps follow below (if required)
1621}
1622
1623template <typename T, size_t SIMD_WIDTH>
1624static SIMD_INLINE void transpose2inplcLane(
1625 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1626 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>,
1627 Bytes<32>)
1628{
1629 zip16<1>(inRows[0], inRows[16], outRows[0], outRows[16]);
1630 zip16<1>(inRows[1], inRows[17], outRows[1], outRows[17]);
1631 zip16<1>(inRows[2], inRows[18], outRows[2], outRows[18]);
1632 zip16<1>(inRows[3], inRows[19], outRows[3], outRows[19]);
1633 zip16<1>(inRows[4], inRows[20], outRows[4], outRows[20]);
1634 zip16<1>(inRows[5], inRows[21], outRows[5], outRows[21]);
1635 zip16<1>(inRows[6], inRows[22], outRows[6], outRows[22]);
1636 zip16<1>(inRows[7], inRows[23], outRows[7], outRows[23]);
1637 zip16<1>(inRows[8], inRows[24], outRows[8], outRows[24]);
1638 zip16<1>(inRows[9], inRows[25], outRows[9], outRows[25]);
1639 zip16<1>(inRows[10], inRows[26], outRows[10], outRows[26]);
1640 zip16<1>(inRows[11], inRows[27], outRows[11], outRows[27]);
1641 zip16<1>(inRows[12], inRows[28], outRows[12], outRows[28]);
1642 zip16<1>(inRows[13], inRows[29], outRows[13], outRows[29]);
1643 zip16<1>(inRows[14], inRows[30], outRows[14], outRows[30]);
1644 zip16<1>(inRows[15], inRows[31], outRows[15], outRows[31]);
1645 zip16<1>(outRows[0], outRows[8], outRows[0], outRows[8]);
1646 zip16<1>(outRows[16], outRows[24], outRows[16], outRows[24]);
1647 zip16<1>(outRows[1], outRows[9], outRows[1], outRows[9]);
1648 zip16<1>(outRows[17], outRows[25], outRows[17], outRows[25]);
1649 zip16<1>(outRows[2], outRows[10], outRows[2], outRows[10]);
1650 zip16<1>(outRows[18], outRows[26], outRows[18], outRows[26]);
1651 zip16<1>(outRows[3], outRows[11], outRows[3], outRows[11]);
1652 zip16<1>(outRows[19], outRows[27], outRows[19], outRows[27]);
1653 zip16<1>(outRows[4], outRows[12], outRows[4], outRows[12]);
1654 zip16<1>(outRows[20], outRows[28], outRows[20], outRows[28]);
1655 zip16<1>(outRows[5], outRows[13], outRows[5], outRows[13]);
1656 zip16<1>(outRows[21], outRows[29], outRows[21], outRows[29]);
1657 zip16<1>(outRows[6], outRows[14], outRows[6], outRows[14]);
1658 zip16<1>(outRows[22], outRows[30], outRows[22], outRows[30]);
1659 zip16<1>(outRows[7], outRows[15], outRows[7], outRows[15]);
1660 zip16<1>(outRows[23], outRows[31], outRows[23], outRows[31]);
1661 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1662 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1663 zip16<1>(outRows[16], outRows[20], outRows[16], outRows[20]);
1664 zip16<1>(outRows[24], outRows[28], outRows[24], outRows[28]);
1665 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1666 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1667 zip16<1>(outRows[17], outRows[21], outRows[17], outRows[21]);
1668 zip16<1>(outRows[25], outRows[29], outRows[25], outRows[29]);
1669 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1670 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1671 zip16<1>(outRows[18], outRows[22], outRows[18], outRows[22]);
1672 zip16<1>(outRows[26], outRows[30], outRows[26], outRows[30]);
1673 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1674 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1675 zip16<1>(outRows[19], outRows[23], outRows[19], outRows[23]);
1676 zip16<1>(outRows[27], outRows[31], outRows[27], outRows[31]);
1677 zip16<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1678 zip16<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1679 zip16<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1680 zip16<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1681 zip16<1>(outRows[16], outRows[18], outRows[16], outRows[18]);
1682 zip16<1>(outRows[20], outRows[22], outRows[20], outRows[22]);
1683 zip16<1>(outRows[24], outRows[26], outRows[24], outRows[26]);
1684 zip16<1>(outRows[28], outRows[30], outRows[28], outRows[30]);
1685 zip16<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1686 zip16<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1687 zip16<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1688 zip16<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1689 zip16<1>(outRows[17], outRows[19], outRows[17], outRows[19]);
1690 zip16<1>(outRows[21], outRows[23], outRows[21], outRows[23]);
1691 zip16<1>(outRows[25], outRows[27], outRows[25], outRows[27]);
1692 zip16<1>(outRows[29], outRows[31], outRows[29], outRows[31]);
1693 // correction steps follow below (if required)
1694 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1695 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1696 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1697 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1698 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1699 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1700 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1701 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1702 zip<1>(outRows[16], outRows[17], outRows[16], outRows[17]);
1703 zip<1>(outRows[18], outRows[19], outRows[18], outRows[19]);
1704 zip<1>(outRows[20], outRows[21], outRows[20], outRows[21]);
1705 zip<1>(outRows[22], outRows[23], outRows[22], outRows[23]);
1706 zip<1>(outRows[24], outRows[25], outRows[24], outRows[25]);
1707 zip<1>(outRows[26], outRows[27], outRows[26], outRows[27]);
1708 zip<1>(outRows[28], outRows[29], outRows[28], outRows[29]);
1709 zip<1>(outRows[30], outRows[31], outRows[30], outRows[31]);
1710 {
1711 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
1712 outRows[1] = outRows[2];
1713 outRows[2] = outRows[4];
1714 outRows[4] = outRows[8];
1715 outRows[8] = outRows[16];
1716 outRows[16] = vec_v;
1717 }
1718 {
1719 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
1720 outRows[3] = outRows[6];
1721 outRows[6] = outRows[12];
1722 outRows[12] = outRows[24];
1723 outRows[24] = outRows[17];
1724 outRows[17] = vec_v;
1725 }
1726 {
1727 Vec<T, SIMD_WIDTH> vec_v = outRows[5];
1728 outRows[5] = outRows[10];
1729 outRows[10] = outRows[20];
1730 outRows[20] = outRows[9];
1731 outRows[9] = outRows[18];
1732 outRows[18] = vec_v;
1733 }
1734 {
1735 Vec<T, SIMD_WIDTH> vec_v = outRows[7];
1736 outRows[7] = outRows[14];
1737 outRows[14] = outRows[28];
1738 outRows[28] = outRows[25];
1739 outRows[25] = outRows[19];
1740 outRows[19] = vec_v;
1741 }
1742 {
1743 Vec<T, SIMD_WIDTH> vec_v = outRows[11];
1744 outRows[11] = outRows[22];
1745 outRows[22] = outRows[13];
1746 outRows[13] = outRows[26];
1747 outRows[26] = outRows[21];
1748 outRows[21] = vec_v;
1749 }
1750 {
1751 Vec<T, SIMD_WIDTH> vec_v = outRows[15];
1752 outRows[15] = outRows[30];
1753 outRows[30] = outRows[29];
1754 outRows[29] = outRows[27];
1755 outRows[27] = outRows[23];
1756 outRows[23] = vec_v;
1757 }
1758}
1759
1760template <typename T, size_t SIMD_WIDTH>
1761static SIMD_INLINE void transpose2inplcLane(
1762 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1763 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
1764 Bytes<32>)
1765{
1766 zip16<1>(inRows[0], inRows[8], outRows[0], outRows[8]);
1767 zip16<1>(inRows[1], inRows[9], outRows[1], outRows[9]);
1768 zip16<1>(inRows[2], inRows[10], outRows[2], outRows[10]);
1769 zip16<1>(inRows[3], inRows[11], outRows[3], outRows[11]);
1770 zip16<1>(inRows[4], inRows[12], outRows[4], outRows[12]);
1771 zip16<1>(inRows[5], inRows[13], outRows[5], outRows[13]);
1772 zip16<1>(inRows[6], inRows[14], outRows[6], outRows[14]);
1773 zip16<1>(inRows[7], inRows[15], outRows[7], outRows[15]);
1774 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1775 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1776 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1777 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1778 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1779 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1780 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
1781 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
1782 zip16<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1783 zip16<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1784 zip16<1>(outRows[8], outRows[10], outRows[8], outRows[10]);
1785 zip16<1>(outRows[12], outRows[14], outRows[12], outRows[14]);
1786 zip16<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1787 zip16<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1788 zip16<1>(outRows[9], outRows[11], outRows[9], outRows[11]);
1789 zip16<1>(outRows[13], outRows[15], outRows[13], outRows[15]);
1790 // correction steps follow below (if required)
1791 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1792 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1793 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1794 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1795 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
1796 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
1797 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
1798 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
1799 {
1800 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
1801 outRows[1] = outRows[2];
1802 outRows[2] = outRows[4];
1803 outRows[4] = outRows[8];
1804 outRows[8] = vec_v;
1805 }
1806 {
1807 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
1808 outRows[3] = outRows[6];
1809 outRows[6] = outRows[12];
1810 outRows[12] = outRows[9];
1811 outRows[9] = vec_v;
1812 }
1813 {
1814 Vec<T, SIMD_WIDTH> vec_v = outRows[5];
1815 outRows[5] = outRows[10];
1816 outRows[10] = vec_v;
1817 }
1818 {
1819 Vec<T, SIMD_WIDTH> vec_v = outRows[7];
1820 outRows[7] = outRows[14];
1821 outRows[14] = outRows[13];
1822 outRows[13] = outRows[11];
1823 outRows[11] = vec_v;
1824 }
1825}
1826
1827template <typename T, size_t SIMD_WIDTH>
1828static SIMD_INLINE void transpose2inplcLane(
1829 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1830 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<32>)
1831{
1832 zip16<1>(inRows[0], inRows[4], outRows[0], outRows[4]);
1833 zip16<1>(inRows[1], inRows[5], outRows[1], outRows[5]);
1834 zip16<1>(inRows[2], inRows[6], outRows[2], outRows[6]);
1835 zip16<1>(inRows[3], inRows[7], outRows[3], outRows[7]);
1836 zip16<1>(outRows[0], outRows[2], outRows[0], outRows[2]);
1837 zip16<1>(outRows[4], outRows[6], outRows[4], outRows[6]);
1838 zip16<1>(outRows[1], outRows[3], outRows[1], outRows[3]);
1839 zip16<1>(outRows[5], outRows[7], outRows[5], outRows[7]);
1840 // correction steps follow below (if required)
1841 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1842 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1843 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
1844 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
1845 {
1846 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
1847 outRows[1] = outRows[2];
1848 outRows[2] = outRows[4];
1849 outRows[4] = vec_v;
1850 }
1851 {
1852 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
1853 outRows[3] = outRows[6];
1854 outRows[6] = outRows[5];
1855 outRows[5] = vec_v;
1856 }
1857}
1858
1859template <typename T, size_t SIMD_WIDTH>
1860static SIMD_INLINE void transpose2inplcLane(
1861 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1862 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<32>)
1863{
1864 zip16<1>(inRows[0], inRows[2], outRows[0], outRows[2]);
1865 zip16<1>(inRows[1], inRows[3], outRows[1], outRows[3]);
1866 // correction steps follow below (if required)
1867 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
1868 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
1869 {
1870 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
1871 outRows[1] = outRows[2];
1872 outRows[2] = vec_v;
1873 }
1874}
1875
1876template <typename T, size_t SIMD_WIDTH>
1877static SIMD_INLINE void transpose2inplcLane(
1878 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
1879 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<64>,
1880 Bytes<64>)
1881{
1882 zip16<1>(inRows[0], inRows[32], outRows[0], outRows[32]);
1883 zip16<1>(inRows[1], inRows[33], outRows[1], outRows[33]);
1884 zip16<1>(inRows[2], inRows[34], outRows[2], outRows[34]);
1885 zip16<1>(inRows[3], inRows[35], outRows[3], outRows[35]);
1886 zip16<1>(inRows[4], inRows[36], outRows[4], outRows[36]);
1887 zip16<1>(inRows[5], inRows[37], outRows[5], outRows[37]);
1888 zip16<1>(inRows[6], inRows[38], outRows[6], outRows[38]);
1889 zip16<1>(inRows[7], inRows[39], outRows[7], outRows[39]);
1890 zip16<1>(inRows[8], inRows[40], outRows[8], outRows[40]);
1891 zip16<1>(inRows[9], inRows[41], outRows[9], outRows[41]);
1892 zip16<1>(inRows[10], inRows[42], outRows[10], outRows[42]);
1893 zip16<1>(inRows[11], inRows[43], outRows[11], outRows[43]);
1894 zip16<1>(inRows[12], inRows[44], outRows[12], outRows[44]);
1895 zip16<1>(inRows[13], inRows[45], outRows[13], outRows[45]);
1896 zip16<1>(inRows[14], inRows[46], outRows[14], outRows[46]);
1897 zip16<1>(inRows[15], inRows[47], outRows[15], outRows[47]);
1898 zip16<1>(inRows[16], inRows[48], outRows[16], outRows[48]);
1899 zip16<1>(inRows[17], inRows[49], outRows[17], outRows[49]);
1900 zip16<1>(inRows[18], inRows[50], outRows[18], outRows[50]);
1901 zip16<1>(inRows[19], inRows[51], outRows[19], outRows[51]);
1902 zip16<1>(inRows[20], inRows[52], outRows[20], outRows[52]);
1903 zip16<1>(inRows[21], inRows[53], outRows[21], outRows[53]);
1904 zip16<1>(inRows[22], inRows[54], outRows[22], outRows[54]);
1905 zip16<1>(inRows[23], inRows[55], outRows[23], outRows[55]);
1906 zip16<1>(inRows[24], inRows[56], outRows[24], outRows[56]);
1907 zip16<1>(inRows[25], inRows[57], outRows[25], outRows[57]);
1908 zip16<1>(inRows[26], inRows[58], outRows[26], outRows[58]);
1909 zip16<1>(inRows[27], inRows[59], outRows[27], outRows[59]);
1910 zip16<1>(inRows[28], inRows[60], outRows[28], outRows[60]);
1911 zip16<1>(inRows[29], inRows[61], outRows[29], outRows[61]);
1912 zip16<1>(inRows[30], inRows[62], outRows[30], outRows[62]);
1913 zip16<1>(inRows[31], inRows[63], outRows[31], outRows[63]);
1914 zip16<1>(outRows[0], outRows[16], outRows[0], outRows[16]);
1915 zip16<1>(outRows[32], outRows[48], outRows[32], outRows[48]);
1916 zip16<1>(outRows[1], outRows[17], outRows[1], outRows[17]);
1917 zip16<1>(outRows[33], outRows[49], outRows[33], outRows[49]);
1918 zip16<1>(outRows[2], outRows[18], outRows[2], outRows[18]);
1919 zip16<1>(outRows[34], outRows[50], outRows[34], outRows[50]);
1920 zip16<1>(outRows[3], outRows[19], outRows[3], outRows[19]);
1921 zip16<1>(outRows[35], outRows[51], outRows[35], outRows[51]);
1922 zip16<1>(outRows[4], outRows[20], outRows[4], outRows[20]);
1923 zip16<1>(outRows[36], outRows[52], outRows[36], outRows[52]);
1924 zip16<1>(outRows[5], outRows[21], outRows[5], outRows[21]);
1925 zip16<1>(outRows[37], outRows[53], outRows[37], outRows[53]);
1926 zip16<1>(outRows[6], outRows[22], outRows[6], outRows[22]);
1927 zip16<1>(outRows[38], outRows[54], outRows[38], outRows[54]);
1928 zip16<1>(outRows[7], outRows[23], outRows[7], outRows[23]);
1929 zip16<1>(outRows[39], outRows[55], outRows[39], outRows[55]);
1930 zip16<1>(outRows[8], outRows[24], outRows[8], outRows[24]);
1931 zip16<1>(outRows[40], outRows[56], outRows[40], outRows[56]);
1932 zip16<1>(outRows[9], outRows[25], outRows[9], outRows[25]);
1933 zip16<1>(outRows[41], outRows[57], outRows[41], outRows[57]);
1934 zip16<1>(outRows[10], outRows[26], outRows[10], outRows[26]);
1935 zip16<1>(outRows[42], outRows[58], outRows[42], outRows[58]);
1936 zip16<1>(outRows[11], outRows[27], outRows[11], outRows[27]);
1937 zip16<1>(outRows[43], outRows[59], outRows[43], outRows[59]);
1938 zip16<1>(outRows[12], outRows[28], outRows[12], outRows[28]);
1939 zip16<1>(outRows[44], outRows[60], outRows[44], outRows[60]);
1940 zip16<1>(outRows[13], outRows[29], outRows[13], outRows[29]);
1941 zip16<1>(outRows[45], outRows[61], outRows[45], outRows[61]);
1942 zip16<1>(outRows[14], outRows[30], outRows[14], outRows[30]);
1943 zip16<1>(outRows[46], outRows[62], outRows[46], outRows[62]);
1944 zip16<1>(outRows[15], outRows[31], outRows[15], outRows[31]);
1945 zip16<1>(outRows[47], outRows[63], outRows[47], outRows[63]);
1946 zip16<1>(outRows[0], outRows[8], outRows[0], outRows[8]);
1947 zip16<1>(outRows[16], outRows[24], outRows[16], outRows[24]);
1948 zip16<1>(outRows[32], outRows[40], outRows[32], outRows[40]);
1949 zip16<1>(outRows[48], outRows[56], outRows[48], outRows[56]);
1950 zip16<1>(outRows[1], outRows[9], outRows[1], outRows[9]);
1951 zip16<1>(outRows[17], outRows[25], outRows[17], outRows[25]);
1952 zip16<1>(outRows[33], outRows[41], outRows[33], outRows[41]);
1953 zip16<1>(outRows[49], outRows[57], outRows[49], outRows[57]);
1954 zip16<1>(outRows[2], outRows[10], outRows[2], outRows[10]);
1955 zip16<1>(outRows[18], outRows[26], outRows[18], outRows[26]);
1956 zip16<1>(outRows[34], outRows[42], outRows[34], outRows[42]);
1957 zip16<1>(outRows[50], outRows[58], outRows[50], outRows[58]);
1958 zip16<1>(outRows[3], outRows[11], outRows[3], outRows[11]);
1959 zip16<1>(outRows[19], outRows[27], outRows[19], outRows[27]);
1960 zip16<1>(outRows[35], outRows[43], outRows[35], outRows[43]);
1961 zip16<1>(outRows[51], outRows[59], outRows[51], outRows[59]);
1962 zip16<1>(outRows[4], outRows[12], outRows[4], outRows[12]);
1963 zip16<1>(outRows[20], outRows[28], outRows[20], outRows[28]);
1964 zip16<1>(outRows[36], outRows[44], outRows[36], outRows[44]);
1965 zip16<1>(outRows[52], outRows[60], outRows[52], outRows[60]);
1966 zip16<1>(outRows[5], outRows[13], outRows[5], outRows[13]);
1967 zip16<1>(outRows[21], outRows[29], outRows[21], outRows[29]);
1968 zip16<1>(outRows[37], outRows[45], outRows[37], outRows[45]);
1969 zip16<1>(outRows[53], outRows[61], outRows[53], outRows[61]);
1970 zip16<1>(outRows[6], outRows[14], outRows[6], outRows[14]);
1971 zip16<1>(outRows[22], outRows[30], outRows[22], outRows[30]);
1972 zip16<1>(outRows[38], outRows[46], outRows[38], outRows[46]);
1973 zip16<1>(outRows[54], outRows[62], outRows[54], outRows[62]);
1974 zip16<1>(outRows[7], outRows[15], outRows[7], outRows[15]);
1975 zip16<1>(outRows[23], outRows[31], outRows[23], outRows[31]);
1976 zip16<1>(outRows[39], outRows[47], outRows[39], outRows[47]);
1977 zip16<1>(outRows[55], outRows[63], outRows[55], outRows[63]);
1978 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
1979 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
1980 zip16<1>(outRows[16], outRows[20], outRows[16], outRows[20]);
1981 zip16<1>(outRows[24], outRows[28], outRows[24], outRows[28]);
1982 zip16<1>(outRows[32], outRows[36], outRows[32], outRows[36]);
1983 zip16<1>(outRows[40], outRows[44], outRows[40], outRows[44]);
1984 zip16<1>(outRows[48], outRows[52], outRows[48], outRows[52]);
1985 zip16<1>(outRows[56], outRows[60], outRows[56], outRows[60]);
1986 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
1987 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
1988 zip16<1>(outRows[17], outRows[21], outRows[17], outRows[21]);
1989 zip16<1>(outRows[25], outRows[29], outRows[25], outRows[29]);
1990 zip16<1>(outRows[33], outRows[37], outRows[33], outRows[37]);
1991 zip16<1>(outRows[41], outRows[45], outRows[41], outRows[45]);
1992 zip16<1>(outRows[49], outRows[53], outRows[49], outRows[53]);
1993 zip16<1>(outRows[57], outRows[61], outRows[57], outRows[61]);
1994 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
1995 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
1996 zip16<1>(outRows[18], outRows[22], outRows[18], outRows[22]);
1997 zip16<1>(outRows[26], outRows[30], outRows[26], outRows[30]);
1998 zip16<1>(outRows[34], outRows[38], outRows[34], outRows[38]);
1999 zip16<1>(outRows[42], outRows[46], outRows[42], outRows[46]);
2000 zip16<1>(outRows[50], outRows[54], outRows[50], outRows[54]);
2001 zip16<1>(outRows[58], outRows[62], outRows[58], outRows[62]);
2002 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
2003 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
2004 zip16<1>(outRows[19], outRows[23], outRows[19], outRows[23]);
2005 zip16<1>(outRows[27], outRows[31], outRows[27], outRows[31]);
2006 zip16<1>(outRows[35], outRows[39], outRows[35], outRows[39]);
2007 zip16<1>(outRows[43], outRows[47], outRows[43], outRows[47]);
2008 zip16<1>(outRows[51], outRows[55], outRows[51], outRows[55]);
2009 zip16<1>(outRows[59], outRows[63], outRows[59], outRows[63]);
2010 // correction steps follow below (if required)
2011 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
2012 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
2013 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
2014 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
2015 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
2016 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
2017 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
2018 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
2019 zip<1>(outRows[16], outRows[17], outRows[16], outRows[17]);
2020 zip<1>(outRows[18], outRows[19], outRows[18], outRows[19]);
2021 zip<1>(outRows[20], outRows[21], outRows[20], outRows[21]);
2022 zip<1>(outRows[22], outRows[23], outRows[22], outRows[23]);
2023 zip<1>(outRows[24], outRows[25], outRows[24], outRows[25]);
2024 zip<1>(outRows[26], outRows[27], outRows[26], outRows[27]);
2025 zip<1>(outRows[28], outRows[29], outRows[28], outRows[29]);
2026 zip<1>(outRows[30], outRows[31], outRows[30], outRows[31]);
2027 zip<1>(outRows[32], outRows[33], outRows[32], outRows[33]);
2028 zip<1>(outRows[34], outRows[35], outRows[34], outRows[35]);
2029 zip<1>(outRows[36], outRows[37], outRows[36], outRows[37]);
2030 zip<1>(outRows[38], outRows[39], outRows[38], outRows[39]);
2031 zip<1>(outRows[40], outRows[41], outRows[40], outRows[41]);
2032 zip<1>(outRows[42], outRows[43], outRows[42], outRows[43]);
2033 zip<1>(outRows[44], outRows[45], outRows[44], outRows[45]);
2034 zip<1>(outRows[46], outRows[47], outRows[46], outRows[47]);
2035 zip<1>(outRows[48], outRows[49], outRows[48], outRows[49]);
2036 zip<1>(outRows[50], outRows[51], outRows[50], outRows[51]);
2037 zip<1>(outRows[52], outRows[53], outRows[52], outRows[53]);
2038 zip<1>(outRows[54], outRows[55], outRows[54], outRows[55]);
2039 zip<1>(outRows[56], outRows[57], outRows[56], outRows[57]);
2040 zip<1>(outRows[58], outRows[59], outRows[58], outRows[59]);
2041 zip<1>(outRows[60], outRows[61], outRows[60], outRows[61]);
2042 zip<1>(outRows[62], outRows[63], outRows[62], outRows[63]);
2043 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
2044 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
2045 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
2046 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
2047 zip<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
2048 zip<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
2049 zip<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
2050 zip<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
2051 zip<2>(outRows[32], outRows[34], outRows[32], outRows[34]);
2052 zip<2>(outRows[36], outRows[38], outRows[36], outRows[38]);
2053 zip<2>(outRows[40], outRows[42], outRows[40], outRows[42]);
2054 zip<2>(outRows[44], outRows[46], outRows[44], outRows[46]);
2055 zip<2>(outRows[48], outRows[50], outRows[48], outRows[50]);
2056 zip<2>(outRows[52], outRows[54], outRows[52], outRows[54]);
2057 zip<2>(outRows[56], outRows[58], outRows[56], outRows[58]);
2058 zip<2>(outRows[60], outRows[62], outRows[60], outRows[62]);
2059 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
2060 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
2061 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
2062 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
2063 zip<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
2064 zip<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
2065 zip<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
2066 zip<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
2067 zip<2>(outRows[33], outRows[35], outRows[33], outRows[35]);
2068 zip<2>(outRows[37], outRows[39], outRows[37], outRows[39]);
2069 zip<2>(outRows[41], outRows[43], outRows[41], outRows[43]);
2070 zip<2>(outRows[45], outRows[47], outRows[45], outRows[47]);
2071 zip<2>(outRows[49], outRows[51], outRows[49], outRows[51]);
2072 zip<2>(outRows[53], outRows[55], outRows[53], outRows[55]);
2073 zip<2>(outRows[57], outRows[59], outRows[57], outRows[59]);
2074 zip<2>(outRows[61], outRows[63], outRows[61], outRows[63]);
2075 {
2076 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
2077 outRows[1] = outRows[4];
2078 outRows[4] = outRows[16];
2079 outRows[16] = outRows[2];
2080 outRows[2] = outRows[8];
2081 outRows[8] = outRows[32];
2082 outRows[32] = vec_v;
2083 }
2084 {
2085 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
2086 outRows[3] = outRows[12];
2087 outRows[12] = outRows[48];
2088 outRows[48] = vec_v;
2089 }
2090 {
2091 Vec<T, SIMD_WIDTH> vec_v = outRows[5];
2092 outRows[5] = outRows[20];
2093 outRows[20] = outRows[18];
2094 outRows[18] = outRows[10];
2095 outRows[10] = outRows[40];
2096 outRows[40] = outRows[33];
2097 outRows[33] = vec_v;
2098 }
2099 {
2100 Vec<T, SIMD_WIDTH> vec_v = outRows[6];
2101 outRows[6] = outRows[24];
2102 outRows[24] = outRows[34];
2103 outRows[34] = outRows[9];
2104 outRows[9] = outRows[36];
2105 outRows[36] = outRows[17];
2106 outRows[17] = vec_v;
2107 }
2108 {
2109 Vec<T, SIMD_WIDTH> vec_v = outRows[7];
2110 outRows[7] = outRows[28];
2111 outRows[28] = outRows[50];
2112 outRows[50] = outRows[11];
2113 outRows[11] = outRows[44];
2114 outRows[44] = outRows[49];
2115 outRows[49] = vec_v;
2116 }
2117 {
2118 Vec<T, SIMD_WIDTH> vec_v = outRows[13];
2119 outRows[13] = outRows[52];
2120 outRows[52] = outRows[19];
2121 outRows[19] = outRows[14];
2122 outRows[14] = outRows[56];
2123 outRows[56] = outRows[35];
2124 outRows[35] = vec_v;
2125 }
2126 {
2127 Vec<T, SIMD_WIDTH> vec_v = outRows[15];
2128 outRows[15] = outRows[60];
2129 outRows[60] = outRows[51];
2130 outRows[51] = vec_v;
2131 }
2132 {
2133 Vec<T, SIMD_WIDTH> vec_v = outRows[21];
2134 outRows[21] = outRows[22];
2135 outRows[22] = outRows[26];
2136 outRows[26] = outRows[42];
2137 outRows[42] = outRows[41];
2138 outRows[41] = outRows[37];
2139 outRows[37] = vec_v;
2140 }
2141 {
2142 Vec<T, SIMD_WIDTH> vec_v = outRows[23];
2143 outRows[23] = outRows[30];
2144 outRows[30] = outRows[58];
2145 outRows[58] = outRows[43];
2146 outRows[43] = outRows[45];
2147 outRows[45] = outRows[53];
2148 outRows[53] = vec_v;
2149 }
2150 {
2151 Vec<T, SIMD_WIDTH> vec_v = outRows[25];
2152 outRows[25] = outRows[38];
2153 outRows[38] = vec_v;
2154 }
2155 {
2156 Vec<T, SIMD_WIDTH> vec_v = outRows[27];
2157 outRows[27] = outRows[46];
2158 outRows[46] = outRows[57];
2159 outRows[57] = outRows[39];
2160 outRows[39] = outRows[29];
2161 outRows[29] = outRows[54];
2162 outRows[54] = vec_v;
2163 }
2164 {
2165 Vec<T, SIMD_WIDTH> vec_v = outRows[31];
2166 outRows[31] = outRows[62];
2167 outRows[62] = outRows[59];
2168 outRows[59] = outRows[47];
2169 outRows[47] = outRows[61];
2170 outRows[61] = outRows[55];
2171 outRows[55] = vec_v;
2172 }
2173}
2174
2175template <typename T, size_t SIMD_WIDTH>
2176static SIMD_INLINE void transpose2inplcLane(
2177 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2178 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<32>,
2179 Bytes<64>)
2180{
2181 zip16<1>(inRows[0], inRows[16], outRows[0], outRows[16]);
2182 zip16<1>(inRows[1], inRows[17], outRows[1], outRows[17]);
2183 zip16<1>(inRows[2], inRows[18], outRows[2], outRows[18]);
2184 zip16<1>(inRows[3], inRows[19], outRows[3], outRows[19]);
2185 zip16<1>(inRows[4], inRows[20], outRows[4], outRows[20]);
2186 zip16<1>(inRows[5], inRows[21], outRows[5], outRows[21]);
2187 zip16<1>(inRows[6], inRows[22], outRows[6], outRows[22]);
2188 zip16<1>(inRows[7], inRows[23], outRows[7], outRows[23]);
2189 zip16<1>(inRows[8], inRows[24], outRows[8], outRows[24]);
2190 zip16<1>(inRows[9], inRows[25], outRows[9], outRows[25]);
2191 zip16<1>(inRows[10], inRows[26], outRows[10], outRows[26]);
2192 zip16<1>(inRows[11], inRows[27], outRows[11], outRows[27]);
2193 zip16<1>(inRows[12], inRows[28], outRows[12], outRows[28]);
2194 zip16<1>(inRows[13], inRows[29], outRows[13], outRows[29]);
2195 zip16<1>(inRows[14], inRows[30], outRows[14], outRows[30]);
2196 zip16<1>(inRows[15], inRows[31], outRows[15], outRows[31]);
2197 zip16<1>(outRows[0], outRows[8], outRows[0], outRows[8]);
2198 zip16<1>(outRows[16], outRows[24], outRows[16], outRows[24]);
2199 zip16<1>(outRows[1], outRows[9], outRows[1], outRows[9]);
2200 zip16<1>(outRows[17], outRows[25], outRows[17], outRows[25]);
2201 zip16<1>(outRows[2], outRows[10], outRows[2], outRows[10]);
2202 zip16<1>(outRows[18], outRows[26], outRows[18], outRows[26]);
2203 zip16<1>(outRows[3], outRows[11], outRows[3], outRows[11]);
2204 zip16<1>(outRows[19], outRows[27], outRows[19], outRows[27]);
2205 zip16<1>(outRows[4], outRows[12], outRows[4], outRows[12]);
2206 zip16<1>(outRows[20], outRows[28], outRows[20], outRows[28]);
2207 zip16<1>(outRows[5], outRows[13], outRows[5], outRows[13]);
2208 zip16<1>(outRows[21], outRows[29], outRows[21], outRows[29]);
2209 zip16<1>(outRows[6], outRows[14], outRows[6], outRows[14]);
2210 zip16<1>(outRows[22], outRows[30], outRows[22], outRows[30]);
2211 zip16<1>(outRows[7], outRows[15], outRows[7], outRows[15]);
2212 zip16<1>(outRows[23], outRows[31], outRows[23], outRows[31]);
2213 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
2214 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
2215 zip16<1>(outRows[16], outRows[20], outRows[16], outRows[20]);
2216 zip16<1>(outRows[24], outRows[28], outRows[24], outRows[28]);
2217 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
2218 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
2219 zip16<1>(outRows[17], outRows[21], outRows[17], outRows[21]);
2220 zip16<1>(outRows[25], outRows[29], outRows[25], outRows[29]);
2221 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
2222 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
2223 zip16<1>(outRows[18], outRows[22], outRows[18], outRows[22]);
2224 zip16<1>(outRows[26], outRows[30], outRows[26], outRows[30]);
2225 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
2226 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
2227 zip16<1>(outRows[19], outRows[23], outRows[19], outRows[23]);
2228 zip16<1>(outRows[27], outRows[31], outRows[27], outRows[31]);
2229 // correction steps follow below (if required)
2230 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
2231 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
2232 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
2233 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
2234 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
2235 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
2236 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
2237 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
2238 zip<1>(outRows[16], outRows[17], outRows[16], outRows[17]);
2239 zip<1>(outRows[18], outRows[19], outRows[18], outRows[19]);
2240 zip<1>(outRows[20], outRows[21], outRows[20], outRows[21]);
2241 zip<1>(outRows[22], outRows[23], outRows[22], outRows[23]);
2242 zip<1>(outRows[24], outRows[25], outRows[24], outRows[25]);
2243 zip<1>(outRows[26], outRows[27], outRows[26], outRows[27]);
2244 zip<1>(outRows[28], outRows[29], outRows[28], outRows[29]);
2245 zip<1>(outRows[30], outRows[31], outRows[30], outRows[31]);
2246 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
2247 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
2248 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
2249 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
2250 zip<2>(outRows[16], outRows[18], outRows[16], outRows[18]);
2251 zip<2>(outRows[20], outRows[22], outRows[20], outRows[22]);
2252 zip<2>(outRows[24], outRows[26], outRows[24], outRows[26]);
2253 zip<2>(outRows[28], outRows[30], outRows[28], outRows[30]);
2254 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
2255 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
2256 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
2257 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
2258 zip<2>(outRows[17], outRows[19], outRows[17], outRows[19]);
2259 zip<2>(outRows[21], outRows[23], outRows[21], outRows[23]);
2260 zip<2>(outRows[25], outRows[27], outRows[25], outRows[27]);
2261 zip<2>(outRows[29], outRows[31], outRows[29], outRows[31]);
2262 {
2263 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
2264 outRows[1] = outRows[4];
2265 outRows[4] = outRows[16];
2266 outRows[16] = vec_v;
2267 }
2268 {
2269 Vec<T, SIMD_WIDTH> vec_v = outRows[2];
2270 outRows[2] = outRows[8];
2271 outRows[8] = vec_v;
2272 }
2273 {
2274 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
2275 outRows[3] = outRows[12];
2276 outRows[12] = outRows[18];
2277 outRows[18] = outRows[9];
2278 outRows[9] = outRows[6];
2279 outRows[6] = outRows[24];
2280 outRows[24] = vec_v;
2281 }
2282 {
2283 Vec<T, SIMD_WIDTH> vec_v = outRows[5];
2284 outRows[5] = outRows[20];
2285 outRows[20] = outRows[17];
2286 outRows[17] = vec_v;
2287 }
2288 {
2289 Vec<T, SIMD_WIDTH> vec_v = outRows[7];
2290 outRows[7] = outRows[28];
2291 outRows[28] = outRows[19];
2292 outRows[19] = outRows[13];
2293 outRows[13] = outRows[22];
2294 outRows[22] = outRows[25];
2295 outRows[25] = vec_v;
2296 }
2297 {
2298 Vec<T, SIMD_WIDTH> vec_v = outRows[11];
2299 outRows[11] = outRows[14];
2300 outRows[14] = outRows[26];
2301 outRows[26] = vec_v;
2302 }
2303 {
2304 Vec<T, SIMD_WIDTH> vec_v = outRows[15];
2305 outRows[15] = outRows[30];
2306 outRows[30] = outRows[27];
2307 outRows[27] = vec_v;
2308 }
2309 {
2310 Vec<T, SIMD_WIDTH> vec_v = outRows[23];
2311 outRows[23] = outRows[29];
2312 outRows[29] = vec_v;
2313 }
2314}
2315
2316template <typename T, size_t SIMD_WIDTH>
2317static SIMD_INLINE void transpose2inplcLane(
2318 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2319 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<16>,
2320 Bytes<64>)
2321{
2322 zip16<1>(inRows[0], inRows[8], outRows[0], outRows[8]);
2323 zip16<1>(inRows[1], inRows[9], outRows[1], outRows[9]);
2324 zip16<1>(inRows[2], inRows[10], outRows[2], outRows[10]);
2325 zip16<1>(inRows[3], inRows[11], outRows[3], outRows[11]);
2326 zip16<1>(inRows[4], inRows[12], outRows[4], outRows[12]);
2327 zip16<1>(inRows[5], inRows[13], outRows[5], outRows[13]);
2328 zip16<1>(inRows[6], inRows[14], outRows[6], outRows[14]);
2329 zip16<1>(inRows[7], inRows[15], outRows[7], outRows[15]);
2330 zip16<1>(outRows[0], outRows[4], outRows[0], outRows[4]);
2331 zip16<1>(outRows[8], outRows[12], outRows[8], outRows[12]);
2332 zip16<1>(outRows[1], outRows[5], outRows[1], outRows[5]);
2333 zip16<1>(outRows[9], outRows[13], outRows[9], outRows[13]);
2334 zip16<1>(outRows[2], outRows[6], outRows[2], outRows[6]);
2335 zip16<1>(outRows[10], outRows[14], outRows[10], outRows[14]);
2336 zip16<1>(outRows[3], outRows[7], outRows[3], outRows[7]);
2337 zip16<1>(outRows[11], outRows[15], outRows[11], outRows[15]);
2338 // correction steps follow below (if required)
2339 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
2340 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
2341 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
2342 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
2343 zip<1>(outRows[8], outRows[9], outRows[8], outRows[9]);
2344 zip<1>(outRows[10], outRows[11], outRows[10], outRows[11]);
2345 zip<1>(outRows[12], outRows[13], outRows[12], outRows[13]);
2346 zip<1>(outRows[14], outRows[15], outRows[14], outRows[15]);
2347 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
2348 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
2349 zip<2>(outRows[8], outRows[10], outRows[8], outRows[10]);
2350 zip<2>(outRows[12], outRows[14], outRows[12], outRows[14]);
2351 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
2352 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
2353 zip<2>(outRows[9], outRows[11], outRows[9], outRows[11]);
2354 zip<2>(outRows[13], outRows[15], outRows[13], outRows[15]);
2355 {
2356 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
2357 outRows[1] = outRows[4];
2358 outRows[4] = outRows[2];
2359 outRows[2] = outRows[8];
2360 outRows[8] = vec_v;
2361 }
2362 {
2363 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
2364 outRows[3] = outRows[12];
2365 outRows[12] = vec_v;
2366 }
2367 {
2368 Vec<T, SIMD_WIDTH> vec_v = outRows[5];
2369 outRows[5] = outRows[6];
2370 outRows[6] = outRows[10];
2371 outRows[10] = outRows[9];
2372 outRows[9] = vec_v;
2373 }
2374 {
2375 Vec<T, SIMD_WIDTH> vec_v = outRows[7];
2376 outRows[7] = outRows[14];
2377 outRows[14] = outRows[11];
2378 outRows[11] = outRows[13];
2379 outRows[13] = vec_v;
2380 }
2381}
2382
2383template <typename T, size_t SIMD_WIDTH>
2384static SIMD_INLINE void transpose2inplcLane(
2385 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2386 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<64>)
2387{
2388 zip16<1>(inRows[0], inRows[4], outRows[0], outRows[4]);
2389 zip16<1>(inRows[1], inRows[5], outRows[1], outRows[5]);
2390 zip16<1>(inRows[2], inRows[6], outRows[2], outRows[6]);
2391 zip16<1>(inRows[3], inRows[7], outRows[3], outRows[7]);
2392 // correction steps follow below (if required)
2393 zip<1>(outRows[0], outRows[1], outRows[0], outRows[1]);
2394 zip<1>(outRows[2], outRows[3], outRows[2], outRows[3]);
2395 zip<1>(outRows[4], outRows[5], outRows[4], outRows[5]);
2396 zip<1>(outRows[6], outRows[7], outRows[6], outRows[7]);
2397 zip<2>(outRows[0], outRows[2], outRows[0], outRows[2]);
2398 zip<2>(outRows[4], outRows[6], outRows[4], outRows[6]);
2399 zip<2>(outRows[1], outRows[3], outRows[1], outRows[3]);
2400 zip<2>(outRows[5], outRows[7], outRows[5], outRows[7]);
2401 {
2402 Vec<T, SIMD_WIDTH> vec_v = outRows[1];
2403 outRows[1] = outRows[4];
2404 outRows[4] = vec_v;
2405 }
2406 {
2407 Vec<T, SIMD_WIDTH> vec_v = outRows[3];
2408 outRows[3] = outRows[6];
2409 outRows[6] = vec_v;
2410 }
2411}
2412
2413template <typename T, size_t SIMD_WIDTH>
2414static SIMD_INLINE void transpose2inplcLane(
2415 const Vec<T, SIMD_WIDTH> inRows[Vec<T, SIMD_WIDTH>::elems],
2416 Vec<T, SIMD_WIDTH> outRows[Vec<T, SIMD_WIDTH>::elems])
2417{
2418 transpose2inplcLane(inRows, outRows, Elements<Vec<T, SIMD_WIDTH>::elements>(),
2419 Bytes<SIMD_WIDTH>());
2420}
2421
2422// ==========================================================
2423// transpose1inplc (1-argument version)
2424// ==========================================================
2425
2426template <typename T, size_t SIMD_WIDTH>
2427static SIMD_INLINE void transpose1inplc(
2428 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<2>)
2429{
2430 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2431}
2432
2433template <typename T, size_t SIMD_WIDTH>
2434static SIMD_INLINE void transpose1inplc(
2435 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>)
2436{
2437 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2438 zip<1>(rows[2], rows[3], rows[2], rows[3]);
2439 zip<2>(rows[0], rows[2], rows[0], rows[2]);
2440 zip<2>(rows[1], rows[3], rows[1], rows[3]);
2441 std::swap(rows[1], rows[2]);
2442}
2443
2444template <typename T, size_t SIMD_WIDTH>
2445static SIMD_INLINE void transpose1inplc(
2446 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>)
2447{
2448 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2449 zip<1>(rows[2], rows[3], rows[2], rows[3]);
2450 zip<1>(rows[4], rows[5], rows[4], rows[5]);
2451 zip<1>(rows[6], rows[7], rows[6], rows[7]);
2452 zip<2>(rows[0], rows[2], rows[0], rows[2]);
2453 zip<2>(rows[1], rows[3], rows[1], rows[3]);
2454 zip<2>(rows[4], rows[6], rows[4], rows[6]);
2455 zip<2>(rows[5], rows[7], rows[5], rows[7]);
2456 zip<4>(rows[0], rows[4], rows[0], rows[4]);
2457 zip<4>(rows[2], rows[6], rows[2], rows[6]);
2458 zip<4>(rows[1], rows[5], rows[1], rows[5]);
2459 zip<4>(rows[3], rows[7], rows[3], rows[7]);
2460 std::swap(rows[1], rows[4]);
2461 std::swap(rows[3], rows[6]);
2462}
2463
2464template <typename T, size_t SIMD_WIDTH>
2465static SIMD_INLINE void transpose1inplc(
2466 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>)
2467{
2468 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2469 zip<1>(rows[2], rows[3], rows[2], rows[3]);
2470 zip<1>(rows[4], rows[5], rows[4], rows[5]);
2471 zip<1>(rows[6], rows[7], rows[6], rows[7]);
2472 zip<1>(rows[8], rows[9], rows[8], rows[9]);
2473 zip<1>(rows[10], rows[11], rows[10], rows[11]);
2474 zip<1>(rows[12], rows[13], rows[12], rows[13]);
2475 zip<1>(rows[14], rows[15], rows[14], rows[15]);
2476 zip<2>(rows[0], rows[2], rows[0], rows[2]);
2477 zip<2>(rows[1], rows[3], rows[1], rows[3]);
2478 zip<2>(rows[4], rows[6], rows[4], rows[6]);
2479 zip<2>(rows[5], rows[7], rows[5], rows[7]);
2480 zip<2>(rows[8], rows[10], rows[8], rows[10]);
2481 zip<2>(rows[9], rows[11], rows[9], rows[11]);
2482 zip<2>(rows[12], rows[14], rows[12], rows[14]);
2483 zip<2>(rows[13], rows[15], rows[13], rows[15]);
2484 zip<4>(rows[0], rows[4], rows[0], rows[4]);
2485 zip<4>(rows[2], rows[6], rows[2], rows[6]);
2486 zip<4>(rows[1], rows[5], rows[1], rows[5]);
2487 zip<4>(rows[3], rows[7], rows[3], rows[7]);
2488 zip<4>(rows[8], rows[12], rows[8], rows[12]);
2489 zip<4>(rows[10], rows[14], rows[10], rows[14]);
2490 zip<4>(rows[9], rows[13], rows[9], rows[13]);
2491 zip<4>(rows[11], rows[15], rows[11], rows[15]);
2492 zip<8>(rows[0], rows[8], rows[0], rows[8]);
2493 zip<8>(rows[4], rows[12], rows[4], rows[12]);
2494 zip<8>(rows[2], rows[10], rows[2], rows[10]);
2495 zip<8>(rows[6], rows[14], rows[6], rows[14]);
2496 zip<8>(rows[1], rows[9], rows[1], rows[9]);
2497 zip<8>(rows[5], rows[13], rows[5], rows[13]);
2498 zip<8>(rows[3], rows[11], rows[3], rows[11]);
2499 zip<8>(rows[7], rows[15], rows[7], rows[15]);
2500 std::swap(rows[1], rows[8]);
2501 std::swap(rows[2], rows[4]);
2502 std::swap(rows[3], rows[12]);
2503 std::swap(rows[5], rows[10]);
2504 std::swap(rows[7], rows[14]);
2505 std::swap(rows[11], rows[13]);
2506}
2507
2508template <typename T, size_t SIMD_WIDTH>
2509static SIMD_INLINE void transpose1inplc(
2510 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>)
2511{
2512 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2513 zip<1>(rows[2], rows[3], rows[2], rows[3]);
2514 zip<1>(rows[4], rows[5], rows[4], rows[5]);
2515 zip<1>(rows[6], rows[7], rows[6], rows[7]);
2516 zip<1>(rows[8], rows[9], rows[8], rows[9]);
2517 zip<1>(rows[10], rows[11], rows[10], rows[11]);
2518 zip<1>(rows[12], rows[13], rows[12], rows[13]);
2519 zip<1>(rows[14], rows[15], rows[14], rows[15]);
2520 zip<1>(rows[16], rows[17], rows[16], rows[17]);
2521 zip<1>(rows[18], rows[19], rows[18], rows[19]);
2522 zip<1>(rows[20], rows[21], rows[20], rows[21]);
2523 zip<1>(rows[22], rows[23], rows[22], rows[23]);
2524 zip<1>(rows[24], rows[25], rows[24], rows[25]);
2525 zip<1>(rows[26], rows[27], rows[26], rows[27]);
2526 zip<1>(rows[28], rows[29], rows[28], rows[29]);
2527 zip<1>(rows[30], rows[31], rows[30], rows[31]);
2528 zip<2>(rows[0], rows[2], rows[0], rows[2]);
2529 zip<2>(rows[1], rows[3], rows[1], rows[3]);
2530 zip<2>(rows[4], rows[6], rows[4], rows[6]);
2531 zip<2>(rows[5], rows[7], rows[5], rows[7]);
2532 zip<2>(rows[8], rows[10], rows[8], rows[10]);
2533 zip<2>(rows[9], rows[11], rows[9], rows[11]);
2534 zip<2>(rows[12], rows[14], rows[12], rows[14]);
2535 zip<2>(rows[13], rows[15], rows[13], rows[15]);
2536 zip<2>(rows[16], rows[18], rows[16], rows[18]);
2537 zip<2>(rows[17], rows[19], rows[17], rows[19]);
2538 zip<2>(rows[20], rows[22], rows[20], rows[22]);
2539 zip<2>(rows[21], rows[23], rows[21], rows[23]);
2540 zip<2>(rows[24], rows[26], rows[24], rows[26]);
2541 zip<2>(rows[25], rows[27], rows[25], rows[27]);
2542 zip<2>(rows[28], rows[30], rows[28], rows[30]);
2543 zip<2>(rows[29], rows[31], rows[29], rows[31]);
2544 zip<4>(rows[0], rows[4], rows[0], rows[4]);
2545 zip<4>(rows[2], rows[6], rows[2], rows[6]);
2546 zip<4>(rows[1], rows[5], rows[1], rows[5]);
2547 zip<4>(rows[3], rows[7], rows[3], rows[7]);
2548 zip<4>(rows[8], rows[12], rows[8], rows[12]);
2549 zip<4>(rows[10], rows[14], rows[10], rows[14]);
2550 zip<4>(rows[9], rows[13], rows[9], rows[13]);
2551 zip<4>(rows[11], rows[15], rows[11], rows[15]);
2552 zip<4>(rows[16], rows[20], rows[16], rows[20]);
2553 zip<4>(rows[18], rows[22], rows[18], rows[22]);
2554 zip<4>(rows[17], rows[21], rows[17], rows[21]);
2555 zip<4>(rows[19], rows[23], rows[19], rows[23]);
2556 zip<4>(rows[24], rows[28], rows[24], rows[28]);
2557 zip<4>(rows[26], rows[30], rows[26], rows[30]);
2558 zip<4>(rows[25], rows[29], rows[25], rows[29]);
2559 zip<4>(rows[27], rows[31], rows[27], rows[31]);
2560 zip<8>(rows[0], rows[8], rows[0], rows[8]);
2561 zip<8>(rows[4], rows[12], rows[4], rows[12]);
2562 zip<8>(rows[2], rows[10], rows[2], rows[10]);
2563 zip<8>(rows[6], rows[14], rows[6], rows[14]);
2564 zip<8>(rows[1], rows[9], rows[1], rows[9]);
2565 zip<8>(rows[5], rows[13], rows[5], rows[13]);
2566 zip<8>(rows[3], rows[11], rows[3], rows[11]);
2567 zip<8>(rows[7], rows[15], rows[7], rows[15]);
2568 zip<8>(rows[16], rows[24], rows[16], rows[24]);
2569 zip<8>(rows[20], rows[28], rows[20], rows[28]);
2570 zip<8>(rows[18], rows[26], rows[18], rows[26]);
2571 zip<8>(rows[22], rows[30], rows[22], rows[30]);
2572 zip<8>(rows[17], rows[25], rows[17], rows[25]);
2573 zip<8>(rows[21], rows[29], rows[21], rows[29]);
2574 zip<8>(rows[19], rows[27], rows[19], rows[27]);
2575 zip<8>(rows[23], rows[31], rows[23], rows[31]);
2576 zip<16>(rows[0], rows[16], rows[0], rows[16]);
2577 zip<16>(rows[8], rows[24], rows[8], rows[24]);
2578 zip<16>(rows[4], rows[20], rows[4], rows[20]);
2579 zip<16>(rows[12], rows[28], rows[12], rows[28]);
2580 zip<16>(rows[2], rows[18], rows[2], rows[18]);
2581 zip<16>(rows[10], rows[26], rows[10], rows[26]);
2582 zip<16>(rows[6], rows[22], rows[6], rows[22]);
2583 zip<16>(rows[14], rows[30], rows[14], rows[30]);
2584 zip<16>(rows[1], rows[17], rows[1], rows[17]);
2585 zip<16>(rows[9], rows[25], rows[9], rows[25]);
2586 zip<16>(rows[5], rows[21], rows[5], rows[21]);
2587 zip<16>(rows[13], rows[29], rows[13], rows[29]);
2588 zip<16>(rows[3], rows[19], rows[3], rows[19]);
2589 zip<16>(rows[11], rows[27], rows[11], rows[27]);
2590 zip<16>(rows[7], rows[23], rows[7], rows[23]);
2591 zip<16>(rows[15], rows[31], rows[15], rows[31]);
2592 std::swap(rows[1], rows[16]);
2593 std::swap(rows[2], rows[8]);
2594 std::swap(rows[3], rows[24]);
2595 std::swap(rows[5], rows[20]);
2596 std::swap(rows[6], rows[12]);
2597 std::swap(rows[7], rows[28]);
2598 std::swap(rows[9], rows[18]);
2599 std::swap(rows[11], rows[26]);
2600 std::swap(rows[13], rows[22]);
2601 std::swap(rows[15], rows[30]);
2602 std::swap(rows[19], rows[25]);
2603 std::swap(rows[23], rows[29]);
2604}
2605
2606template <typename T, size_t SIMD_WIDTH>
2607static SIMD_INLINE void transpose1inplc(
2608 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<64>)
2609{
2610 zip<1>(rows[0], rows[1], rows[0], rows[1]);
2611 zip<1>(rows[2], rows[3], rows[2], rows[3]);
2612 zip<1>(rows[4], rows[5], rows[4], rows[5]);
2613 zip<1>(rows[6], rows[7], rows[6], rows[7]);
2614 zip<1>(rows[8], rows[9], rows[8], rows[9]);
2615 zip<1>(rows[10], rows[11], rows[10], rows[11]);
2616 zip<1>(rows[12], rows[13], rows[12], rows[13]);
2617 zip<1>(rows[14], rows[15], rows[14], rows[15]);
2618 zip<1>(rows[16], rows[17], rows[16], rows[17]);
2619 zip<1>(rows[18], rows[19], rows[18], rows[19]);
2620 zip<1>(rows[20], rows[21], rows[20], rows[21]);
2621 zip<1>(rows[22], rows[23], rows[22], rows[23]);
2622 zip<1>(rows[24], rows[25], rows[24], rows[25]);
2623 zip<1>(rows[26], rows[27], rows[26], rows[27]);
2624 zip<1>(rows[28], rows[29], rows[28], rows[29]);
2625 zip<1>(rows[30], rows[31], rows[30], rows[31]);
2626 zip<1>(rows[32], rows[33], rows[32], rows[33]);
2627 zip<1>(rows[34], rows[35], rows[34], rows[35]);
2628 zip<1>(rows[36], rows[37], rows[36], rows[37]);
2629 zip<1>(rows[38], rows[39], rows[38], rows[39]);
2630 zip<1>(rows[40], rows[41], rows[40], rows[41]);
2631 zip<1>(rows[42], rows[43], rows[42], rows[43]);
2632 zip<1>(rows[44], rows[45], rows[44], rows[45]);
2633 zip<1>(rows[46], rows[47], rows[46], rows[47]);
2634 zip<1>(rows[48], rows[49], rows[48], rows[49]);
2635 zip<1>(rows[50], rows[51], rows[50], rows[51]);
2636 zip<1>(rows[52], rows[53], rows[52], rows[53]);
2637 zip<1>(rows[54], rows[55], rows[54], rows[55]);
2638 zip<1>(rows[56], rows[57], rows[56], rows[57]);
2639 zip<1>(rows[58], rows[59], rows[58], rows[59]);
2640 zip<1>(rows[60], rows[61], rows[60], rows[61]);
2641 zip<1>(rows[62], rows[63], rows[62], rows[63]);
2642 zip<2>(rows[0], rows[2], rows[0], rows[2]);
2643 zip<2>(rows[1], rows[3], rows[1], rows[3]);
2644 zip<2>(rows[4], rows[6], rows[4], rows[6]);
2645 zip<2>(rows[5], rows[7], rows[5], rows[7]);
2646 zip<2>(rows[8], rows[10], rows[8], rows[10]);
2647 zip<2>(rows[9], rows[11], rows[9], rows[11]);
2648 zip<2>(rows[12], rows[14], rows[12], rows[14]);
2649 zip<2>(rows[13], rows[15], rows[13], rows[15]);
2650 zip<2>(rows[16], rows[18], rows[16], rows[18]);
2651 zip<2>(rows[17], rows[19], rows[17], rows[19]);
2652 zip<2>(rows[20], rows[22], rows[20], rows[22]);
2653 zip<2>(rows[21], rows[23], rows[21], rows[23]);
2654 zip<2>(rows[24], rows[26], rows[24], rows[26]);
2655 zip<2>(rows[25], rows[27], rows[25], rows[27]);
2656 zip<2>(rows[28], rows[30], rows[28], rows[30]);
2657 zip<2>(rows[29], rows[31], rows[29], rows[31]);
2658 zip<2>(rows[32], rows[34], rows[32], rows[34]);
2659 zip<2>(rows[33], rows[35], rows[33], rows[35]);
2660 zip<2>(rows[36], rows[38], rows[36], rows[38]);
2661 zip<2>(rows[37], rows[39], rows[37], rows[39]);
2662 zip<2>(rows[40], rows[42], rows[40], rows[42]);
2663 zip<2>(rows[41], rows[43], rows[41], rows[43]);
2664 zip<2>(rows[44], rows[46], rows[44], rows[46]);
2665 zip<2>(rows[45], rows[47], rows[45], rows[47]);
2666 zip<2>(rows[48], rows[50], rows[48], rows[50]);
2667 zip<2>(rows[49], rows[51], rows[49], rows[51]);
2668 zip<2>(rows[52], rows[54], rows[52], rows[54]);
2669 zip<2>(rows[53], rows[55], rows[53], rows[55]);
2670 zip<2>(rows[56], rows[58], rows[56], rows[58]);
2671 zip<2>(rows[57], rows[59], rows[57], rows[59]);
2672 zip<2>(rows[60], rows[62], rows[60], rows[62]);
2673 zip<2>(rows[61], rows[63], rows[61], rows[63]);
2674 zip<4>(rows[0], rows[4], rows[0], rows[4]);
2675 zip<4>(rows[2], rows[6], rows[2], rows[6]);
2676 zip<4>(rows[1], rows[5], rows[1], rows[5]);
2677 zip<4>(rows[3], rows[7], rows[3], rows[7]);
2678 zip<4>(rows[8], rows[12], rows[8], rows[12]);
2679 zip<4>(rows[10], rows[14], rows[10], rows[14]);
2680 zip<4>(rows[9], rows[13], rows[9], rows[13]);
2681 zip<4>(rows[11], rows[15], rows[11], rows[15]);
2682 zip<4>(rows[16], rows[20], rows[16], rows[20]);
2683 zip<4>(rows[18], rows[22], rows[18], rows[22]);
2684 zip<4>(rows[17], rows[21], rows[17], rows[21]);
2685 zip<4>(rows[19], rows[23], rows[19], rows[23]);
2686 zip<4>(rows[24], rows[28], rows[24], rows[28]);
2687 zip<4>(rows[26], rows[30], rows[26], rows[30]);
2688 zip<4>(rows[25], rows[29], rows[25], rows[29]);
2689 zip<4>(rows[27], rows[31], rows[27], rows[31]);
2690 zip<4>(rows[32], rows[36], rows[32], rows[36]);
2691 zip<4>(rows[34], rows[38], rows[34], rows[38]);
2692 zip<4>(rows[33], rows[37], rows[33], rows[37]);
2693 zip<4>(rows[35], rows[39], rows[35], rows[39]);
2694 zip<4>(rows[40], rows[44], rows[40], rows[44]);
2695 zip<4>(rows[42], rows[46], rows[42], rows[46]);
2696 zip<4>(rows[41], rows[45], rows[41], rows[45]);
2697 zip<4>(rows[43], rows[47], rows[43], rows[47]);
2698 zip<4>(rows[48], rows[52], rows[48], rows[52]);
2699 zip<4>(rows[50], rows[54], rows[50], rows[54]);
2700 zip<4>(rows[49], rows[53], rows[49], rows[53]);
2701 zip<4>(rows[51], rows[55], rows[51], rows[55]);
2702 zip<4>(rows[56], rows[60], rows[56], rows[60]);
2703 zip<4>(rows[58], rows[62], rows[58], rows[62]);
2704 zip<4>(rows[57], rows[61], rows[57], rows[61]);
2705 zip<4>(rows[59], rows[63], rows[59], rows[63]);
2706 zip<8>(rows[0], rows[8], rows[0], rows[8]);
2707 zip<8>(rows[4], rows[12], rows[4], rows[12]);
2708 zip<8>(rows[2], rows[10], rows[2], rows[10]);
2709 zip<8>(rows[6], rows[14], rows[6], rows[14]);
2710 zip<8>(rows[1], rows[9], rows[1], rows[9]);
2711 zip<8>(rows[5], rows[13], rows[5], rows[13]);
2712 zip<8>(rows[3], rows[11], rows[3], rows[11]);
2713 zip<8>(rows[7], rows[15], rows[7], rows[15]);
2714 zip<8>(rows[16], rows[24], rows[16], rows[24]);
2715 zip<8>(rows[20], rows[28], rows[20], rows[28]);
2716 zip<8>(rows[18], rows[26], rows[18], rows[26]);
2717 zip<8>(rows[22], rows[30], rows[22], rows[30]);
2718 zip<8>(rows[17], rows[25], rows[17], rows[25]);
2719 zip<8>(rows[21], rows[29], rows[21], rows[29]);
2720 zip<8>(rows[19], rows[27], rows[19], rows[27]);
2721 zip<8>(rows[23], rows[31], rows[23], rows[31]);
2722 zip<8>(rows[32], rows[40], rows[32], rows[40]);
2723 zip<8>(rows[36], rows[44], rows[36], rows[44]);
2724 zip<8>(rows[34], rows[42], rows[34], rows[42]);
2725 zip<8>(rows[38], rows[46], rows[38], rows[46]);
2726 zip<8>(rows[33], rows[41], rows[33], rows[41]);
2727 zip<8>(rows[37], rows[45], rows[37], rows[45]);
2728 zip<8>(rows[35], rows[43], rows[35], rows[43]);
2729 zip<8>(rows[39], rows[47], rows[39], rows[47]);
2730 zip<8>(rows[48], rows[56], rows[48], rows[56]);
2731 zip<8>(rows[52], rows[60], rows[52], rows[60]);
2732 zip<8>(rows[50], rows[58], rows[50], rows[58]);
2733 zip<8>(rows[54], rows[62], rows[54], rows[62]);
2734 zip<8>(rows[49], rows[57], rows[49], rows[57]);
2735 zip<8>(rows[53], rows[61], rows[53], rows[61]);
2736 zip<8>(rows[51], rows[59], rows[51], rows[59]);
2737 zip<8>(rows[55], rows[63], rows[55], rows[63]);
2738 zip<16>(rows[0], rows[16], rows[0], rows[16]);
2739 zip<16>(rows[8], rows[24], rows[8], rows[24]);
2740 zip<16>(rows[4], rows[20], rows[4], rows[20]);
2741 zip<16>(rows[12], rows[28], rows[12], rows[28]);
2742 zip<16>(rows[2], rows[18], rows[2], rows[18]);
2743 zip<16>(rows[10], rows[26], rows[10], rows[26]);
2744 zip<16>(rows[6], rows[22], rows[6], rows[22]);
2745 zip<16>(rows[14], rows[30], rows[14], rows[30]);
2746 zip<16>(rows[1], rows[17], rows[1], rows[17]);
2747 zip<16>(rows[9], rows[25], rows[9], rows[25]);
2748 zip<16>(rows[5], rows[21], rows[5], rows[21]);
2749 zip<16>(rows[13], rows[29], rows[13], rows[29]);
2750 zip<16>(rows[3], rows[19], rows[3], rows[19]);
2751 zip<16>(rows[11], rows[27], rows[11], rows[27]);
2752 zip<16>(rows[7], rows[23], rows[7], rows[23]);
2753 zip<16>(rows[15], rows[31], rows[15], rows[31]);
2754 zip<16>(rows[32], rows[48], rows[32], rows[48]);
2755 zip<16>(rows[40], rows[56], rows[40], rows[56]);
2756 zip<16>(rows[36], rows[52], rows[36], rows[52]);
2757 zip<16>(rows[44], rows[60], rows[44], rows[60]);
2758 zip<16>(rows[34], rows[50], rows[34], rows[50]);
2759 zip<16>(rows[42], rows[58], rows[42], rows[58]);
2760 zip<16>(rows[38], rows[54], rows[38], rows[54]);
2761 zip<16>(rows[46], rows[62], rows[46], rows[62]);
2762 zip<16>(rows[33], rows[49], rows[33], rows[49]);
2763 zip<16>(rows[41], rows[57], rows[41], rows[57]);
2764 zip<16>(rows[37], rows[53], rows[37], rows[53]);
2765 zip<16>(rows[45], rows[61], rows[45], rows[61]);
2766 zip<16>(rows[35], rows[51], rows[35], rows[51]);
2767 zip<16>(rows[43], rows[59], rows[43], rows[59]);
2768 zip<16>(rows[39], rows[55], rows[39], rows[55]);
2769 zip<16>(rows[47], rows[63], rows[47], rows[63]);
2770 zip<32>(rows[0], rows[32], rows[0], rows[32]);
2771 zip<32>(rows[16], rows[48], rows[16], rows[48]);
2772 zip<32>(rows[8], rows[40], rows[8], rows[40]);
2773 zip<32>(rows[24], rows[56], rows[24], rows[56]);
2774 zip<32>(rows[4], rows[36], rows[4], rows[36]);
2775 zip<32>(rows[20], rows[52], rows[20], rows[52]);
2776 zip<32>(rows[12], rows[44], rows[12], rows[44]);
2777 zip<32>(rows[28], rows[60], rows[28], rows[60]);
2778 zip<32>(rows[2], rows[34], rows[2], rows[34]);
2779 zip<32>(rows[18], rows[50], rows[18], rows[50]);
2780 zip<32>(rows[10], rows[42], rows[10], rows[42]);
2781 zip<32>(rows[26], rows[58], rows[26], rows[58]);
2782 zip<32>(rows[6], rows[38], rows[6], rows[38]);
2783 zip<32>(rows[22], rows[54], rows[22], rows[54]);
2784 zip<32>(rows[14], rows[46], rows[14], rows[46]);
2785 zip<32>(rows[30], rows[62], rows[30], rows[62]);
2786 zip<32>(rows[1], rows[33], rows[1], rows[33]);
2787 zip<32>(rows[17], rows[49], rows[17], rows[49]);
2788 zip<32>(rows[9], rows[41], rows[9], rows[41]);
2789 zip<32>(rows[25], rows[57], rows[25], rows[57]);
2790 zip<32>(rows[5], rows[37], rows[5], rows[37]);
2791 zip<32>(rows[21], rows[53], rows[21], rows[53]);
2792 zip<32>(rows[13], rows[45], rows[13], rows[45]);
2793 zip<32>(rows[29], rows[61], rows[29], rows[61]);
2794 zip<32>(rows[3], rows[35], rows[3], rows[35]);
2795 zip<32>(rows[19], rows[51], rows[19], rows[51]);
2796 zip<32>(rows[11], rows[43], rows[11], rows[43]);
2797 zip<32>(rows[27], rows[59], rows[27], rows[59]);
2798 zip<32>(rows[7], rows[39], rows[7], rows[39]);
2799 zip<32>(rows[23], rows[55], rows[23], rows[55]);
2800 zip<32>(rows[15], rows[47], rows[15], rows[47]);
2801 zip<32>(rows[31], rows[63], rows[31], rows[63]);
2802 std::swap(rows[1], rows[32]);
2803 std::swap(rows[2], rows[16]);
2804 std::swap(rows[3], rows[48]);
2805 std::swap(rows[4], rows[8]);
2806 std::swap(rows[5], rows[40]);
2807 std::swap(rows[6], rows[24]);
2808 std::swap(rows[7], rows[56]);
2809 std::swap(rows[9], rows[36]);
2810 std::swap(rows[10], rows[20]);
2811 std::swap(rows[11], rows[52]);
2812 std::swap(rows[13], rows[44]);
2813 std::swap(rows[14], rows[28]);
2814 std::swap(rows[15], rows[60]);
2815 std::swap(rows[17], rows[34]);
2816 std::swap(rows[19], rows[50]);
2817 std::swap(rows[21], rows[42]);
2818 std::swap(rows[22], rows[26]);
2819 std::swap(rows[23], rows[58]);
2820 std::swap(rows[25], rows[38]);
2821 std::swap(rows[27], rows[54]);
2822 std::swap(rows[29], rows[46]);
2823 std::swap(rows[31], rows[62]);
2824 std::swap(rows[35], rows[49]);
2825 std::swap(rows[37], rows[41]);
2826 std::swap(rows[39], rows[57]);
2827 std::swap(rows[43], rows[53]);
2828 std::swap(rows[47], rows[61]);
2829 std::swap(rows[55], rows[59]);
2830}
2831
2832template <typename T, size_t SIMD_WIDTH>
2833static SIMD_INLINE void transpose1inplc(
2834 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems])
2835{
2836 transpose1inplc(rows, Elements<Vec<T, SIMD_WIDTH>::elements>());
2837}
2838
2839// ==========================================================
2840// transpose1inplcLane (1-argument version)
2841// ==========================================================
2842
2843template <typename T, size_t SIMD_WIDTH>
2844static SIMD_INLINE void transpose1inplcLane(
2845 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<16>)
2846{
2847 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
2848 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
2849 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
2850 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
2851 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
2852 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
2853 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
2854 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
2855 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
2856 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
2857 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
2858 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
2859 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
2860 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
2861 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
2862 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
2863 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
2864 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
2865 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
2866 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
2867 zip16<4>(rows[8], rows[12], rows[8], rows[12]);
2868 zip16<4>(rows[10], rows[14], rows[10], rows[14]);
2869 zip16<4>(rows[9], rows[13], rows[9], rows[13]);
2870 zip16<4>(rows[11], rows[15], rows[11], rows[15]);
2871 zip16<8>(rows[0], rows[8], rows[0], rows[8]);
2872 zip16<8>(rows[4], rows[12], rows[4], rows[12]);
2873 zip16<8>(rows[2], rows[10], rows[2], rows[10]);
2874 zip16<8>(rows[6], rows[14], rows[6], rows[14]);
2875 zip16<8>(rows[1], rows[9], rows[1], rows[9]);
2876 zip16<8>(rows[5], rows[13], rows[5], rows[13]);
2877 zip16<8>(rows[3], rows[11], rows[3], rows[11]);
2878 zip16<8>(rows[7], rows[15], rows[7], rows[15]);
2879 std::swap(rows[1], rows[8]);
2880 std::swap(rows[2], rows[4]);
2881 std::swap(rows[3], rows[12]);
2882 std::swap(rows[5], rows[10]);
2883 std::swap(rows[7], rows[14]);
2884 std::swap(rows[11], rows[13]);
2885 // correction steps follow below (if required)
2886}
2887
2888template <typename T, size_t SIMD_WIDTH>
2889static SIMD_INLINE void transpose1inplcLane(
2890 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<16>)
2891{
2892 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
2893 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
2894 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
2895 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
2896 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
2897 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
2898 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
2899 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
2900 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
2901 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
2902 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
2903 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
2904 std::swap(rows[1], rows[4]);
2905 std::swap(rows[3], rows[6]);
2906 // correction steps follow below (if required)
2907}
2908
2909template <typename T, size_t SIMD_WIDTH>
2910static SIMD_INLINE void transpose1inplcLane(
2911 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<16>)
2912{
2913 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
2914 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
2915 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
2916 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
2917 std::swap(rows[1], rows[2]);
2918 // correction steps follow below (if required)
2919}
2920
2921template <typename T, size_t SIMD_WIDTH>
2922static SIMD_INLINE void transpose1inplcLane(
2923 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<2>, Bytes<16>)
2924{
2925 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
2926 // correction steps follow below (if required)
2927}
2928
2929template <typename T, size_t SIMD_WIDTH>
2930static SIMD_INLINE void transpose1inplcLane(
2931 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>, Bytes<32>)
2932{
2933 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
2934 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
2935 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
2936 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
2937 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
2938 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
2939 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
2940 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
2941 zip16<1>(rows[16], rows[17], rows[16], rows[17]);
2942 zip16<1>(rows[18], rows[19], rows[18], rows[19]);
2943 zip16<1>(rows[20], rows[21], rows[20], rows[21]);
2944 zip16<1>(rows[22], rows[23], rows[22], rows[23]);
2945 zip16<1>(rows[24], rows[25], rows[24], rows[25]);
2946 zip16<1>(rows[26], rows[27], rows[26], rows[27]);
2947 zip16<1>(rows[28], rows[29], rows[28], rows[29]);
2948 zip16<1>(rows[30], rows[31], rows[30], rows[31]);
2949 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
2950 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
2951 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
2952 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
2953 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
2954 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
2955 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
2956 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
2957 zip16<2>(rows[16], rows[18], rows[16], rows[18]);
2958 zip16<2>(rows[17], rows[19], rows[17], rows[19]);
2959 zip16<2>(rows[20], rows[22], rows[20], rows[22]);
2960 zip16<2>(rows[21], rows[23], rows[21], rows[23]);
2961 zip16<2>(rows[24], rows[26], rows[24], rows[26]);
2962 zip16<2>(rows[25], rows[27], rows[25], rows[27]);
2963 zip16<2>(rows[28], rows[30], rows[28], rows[30]);
2964 zip16<2>(rows[29], rows[31], rows[29], rows[31]);
2965 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
2966 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
2967 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
2968 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
2969 zip16<4>(rows[8], rows[12], rows[8], rows[12]);
2970 zip16<4>(rows[10], rows[14], rows[10], rows[14]);
2971 zip16<4>(rows[9], rows[13], rows[9], rows[13]);
2972 zip16<4>(rows[11], rows[15], rows[11], rows[15]);
2973 zip16<4>(rows[16], rows[20], rows[16], rows[20]);
2974 zip16<4>(rows[18], rows[22], rows[18], rows[22]);
2975 zip16<4>(rows[17], rows[21], rows[17], rows[21]);
2976 zip16<4>(rows[19], rows[23], rows[19], rows[23]);
2977 zip16<4>(rows[24], rows[28], rows[24], rows[28]);
2978 zip16<4>(rows[26], rows[30], rows[26], rows[30]);
2979 zip16<4>(rows[25], rows[29], rows[25], rows[29]);
2980 zip16<4>(rows[27], rows[31], rows[27], rows[31]);
2981 zip16<8>(rows[0], rows[8], rows[0], rows[8]);
2982 zip16<8>(rows[4], rows[12], rows[4], rows[12]);
2983 zip16<8>(rows[2], rows[10], rows[2], rows[10]);
2984 zip16<8>(rows[6], rows[14], rows[6], rows[14]);
2985 zip16<8>(rows[1], rows[9], rows[1], rows[9]);
2986 zip16<8>(rows[5], rows[13], rows[5], rows[13]);
2987 zip16<8>(rows[3], rows[11], rows[3], rows[11]);
2988 zip16<8>(rows[7], rows[15], rows[7], rows[15]);
2989 zip16<8>(rows[16], rows[24], rows[16], rows[24]);
2990 zip16<8>(rows[20], rows[28], rows[20], rows[28]);
2991 zip16<8>(rows[18], rows[26], rows[18], rows[26]);
2992 zip16<8>(rows[22], rows[30], rows[22], rows[30]);
2993 zip16<8>(rows[17], rows[25], rows[17], rows[25]);
2994 zip16<8>(rows[21], rows[29], rows[21], rows[29]);
2995 zip16<8>(rows[19], rows[27], rows[19], rows[27]);
2996 zip16<8>(rows[23], rows[31], rows[23], rows[31]);
2997 std::swap(rows[1], rows[8]);
2998 std::swap(rows[2], rows[4]);
2999 std::swap(rows[3], rows[12]);
3000 std::swap(rows[5], rows[10]);
3001 std::swap(rows[7], rows[14]);
3002 std::swap(rows[11], rows[13]);
3003 std::swap(rows[17], rows[24]);
3004 std::swap(rows[18], rows[20]);
3005 std::swap(rows[19], rows[28]);
3006 std::swap(rows[21], rows[26]);
3007 std::swap(rows[23], rows[30]);
3008 std::swap(rows[27], rows[29]);
3009 // correction steps follow below (if required)
3010 zip<16>(rows[0], rows[16], rows[0], rows[16]);
3011 zip<16>(rows[1], rows[17], rows[1], rows[17]);
3012 zip<16>(rows[2], rows[18], rows[2], rows[18]);
3013 zip<16>(rows[3], rows[19], rows[3], rows[19]);
3014 zip<16>(rows[4], rows[20], rows[4], rows[20]);
3015 zip<16>(rows[5], rows[21], rows[5], rows[21]);
3016 zip<16>(rows[6], rows[22], rows[6], rows[22]);
3017 zip<16>(rows[7], rows[23], rows[7], rows[23]);
3018 zip<16>(rows[8], rows[24], rows[8], rows[24]);
3019 zip<16>(rows[9], rows[25], rows[9], rows[25]);
3020 zip<16>(rows[10], rows[26], rows[10], rows[26]);
3021 zip<16>(rows[11], rows[27], rows[11], rows[27]);
3022 zip<16>(rows[12], rows[28], rows[12], rows[28]);
3023 zip<16>(rows[13], rows[29], rows[13], rows[29]);
3024 zip<16>(rows[14], rows[30], rows[14], rows[30]);
3025 zip<16>(rows[15], rows[31], rows[15], rows[31]);
3026}
3027
3028template <typename T, size_t SIMD_WIDTH>
3029static SIMD_INLINE void transpose1inplcLane(
3030 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<32>)
3031{
3032 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3033 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3034 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3035 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3036 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
3037 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
3038 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
3039 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
3040 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
3041 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
3042 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
3043 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
3044 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
3045 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
3046 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
3047 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
3048 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
3049 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
3050 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
3051 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
3052 zip16<4>(rows[8], rows[12], rows[8], rows[12]);
3053 zip16<4>(rows[10], rows[14], rows[10], rows[14]);
3054 zip16<4>(rows[9], rows[13], rows[9], rows[13]);
3055 zip16<4>(rows[11], rows[15], rows[11], rows[15]);
3056 std::swap(rows[1], rows[4]);
3057 std::swap(rows[3], rows[6]);
3058 std::swap(rows[9], rows[12]);
3059 std::swap(rows[11], rows[14]);
3060 // correction steps follow below (if required)
3061 zip<8>(rows[0], rows[8], rows[0], rows[8]);
3062 zip<8>(rows[1], rows[9], rows[1], rows[9]);
3063 zip<8>(rows[2], rows[10], rows[2], rows[10]);
3064 zip<8>(rows[3], rows[11], rows[3], rows[11]);
3065 zip<8>(rows[4], rows[12], rows[4], rows[12]);
3066 zip<8>(rows[5], rows[13], rows[5], rows[13]);
3067 zip<8>(rows[6], rows[14], rows[6], rows[14]);
3068 zip<8>(rows[7], rows[15], rows[7], rows[15]);
3069}
3070
3071template <typename T, size_t SIMD_WIDTH>
3072static SIMD_INLINE void transpose1inplcLane(
3073 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<32>)
3074{
3075 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3076 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3077 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3078 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3079 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
3080 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
3081 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
3082 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
3083 std::swap(rows[1], rows[2]);
3084 std::swap(rows[5], rows[6]);
3085 // correction steps follow below (if required)
3086 zip<4>(rows[0], rows[4], rows[0], rows[4]);
3087 zip<4>(rows[1], rows[5], rows[1], rows[5]);
3088 zip<4>(rows[2], rows[6], rows[2], rows[6]);
3089 zip<4>(rows[3], rows[7], rows[3], rows[7]);
3090}
3091
3092template <typename T, size_t SIMD_WIDTH>
3093static SIMD_INLINE void transpose1inplcLane(
3094 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<32>)
3095{
3096 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3097 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3098 // correction steps follow below (if required)
3099 zip<2>(rows[0], rows[2], rows[0], rows[2]);
3100 zip<2>(rows[1], rows[3], rows[1], rows[3]);
3101}
3102
3103template <typename T, size_t SIMD_WIDTH>
3104static SIMD_INLINE void transpose1inplcLane(
3105 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<64>, Bytes<64>)
3106{
3107 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3108 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3109 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3110 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3111 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
3112 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
3113 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
3114 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
3115 zip16<1>(rows[16], rows[17], rows[16], rows[17]);
3116 zip16<1>(rows[18], rows[19], rows[18], rows[19]);
3117 zip16<1>(rows[20], rows[21], rows[20], rows[21]);
3118 zip16<1>(rows[22], rows[23], rows[22], rows[23]);
3119 zip16<1>(rows[24], rows[25], rows[24], rows[25]);
3120 zip16<1>(rows[26], rows[27], rows[26], rows[27]);
3121 zip16<1>(rows[28], rows[29], rows[28], rows[29]);
3122 zip16<1>(rows[30], rows[31], rows[30], rows[31]);
3123 zip16<1>(rows[32], rows[33], rows[32], rows[33]);
3124 zip16<1>(rows[34], rows[35], rows[34], rows[35]);
3125 zip16<1>(rows[36], rows[37], rows[36], rows[37]);
3126 zip16<1>(rows[38], rows[39], rows[38], rows[39]);
3127 zip16<1>(rows[40], rows[41], rows[40], rows[41]);
3128 zip16<1>(rows[42], rows[43], rows[42], rows[43]);
3129 zip16<1>(rows[44], rows[45], rows[44], rows[45]);
3130 zip16<1>(rows[46], rows[47], rows[46], rows[47]);
3131 zip16<1>(rows[48], rows[49], rows[48], rows[49]);
3132 zip16<1>(rows[50], rows[51], rows[50], rows[51]);
3133 zip16<1>(rows[52], rows[53], rows[52], rows[53]);
3134 zip16<1>(rows[54], rows[55], rows[54], rows[55]);
3135 zip16<1>(rows[56], rows[57], rows[56], rows[57]);
3136 zip16<1>(rows[58], rows[59], rows[58], rows[59]);
3137 zip16<1>(rows[60], rows[61], rows[60], rows[61]);
3138 zip16<1>(rows[62], rows[63], rows[62], rows[63]);
3139 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
3140 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
3141 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
3142 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
3143 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
3144 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
3145 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
3146 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
3147 zip16<2>(rows[16], rows[18], rows[16], rows[18]);
3148 zip16<2>(rows[17], rows[19], rows[17], rows[19]);
3149 zip16<2>(rows[20], rows[22], rows[20], rows[22]);
3150 zip16<2>(rows[21], rows[23], rows[21], rows[23]);
3151 zip16<2>(rows[24], rows[26], rows[24], rows[26]);
3152 zip16<2>(rows[25], rows[27], rows[25], rows[27]);
3153 zip16<2>(rows[28], rows[30], rows[28], rows[30]);
3154 zip16<2>(rows[29], rows[31], rows[29], rows[31]);
3155 zip16<2>(rows[32], rows[34], rows[32], rows[34]);
3156 zip16<2>(rows[33], rows[35], rows[33], rows[35]);
3157 zip16<2>(rows[36], rows[38], rows[36], rows[38]);
3158 zip16<2>(rows[37], rows[39], rows[37], rows[39]);
3159 zip16<2>(rows[40], rows[42], rows[40], rows[42]);
3160 zip16<2>(rows[41], rows[43], rows[41], rows[43]);
3161 zip16<2>(rows[44], rows[46], rows[44], rows[46]);
3162 zip16<2>(rows[45], rows[47], rows[45], rows[47]);
3163 zip16<2>(rows[48], rows[50], rows[48], rows[50]);
3164 zip16<2>(rows[49], rows[51], rows[49], rows[51]);
3165 zip16<2>(rows[52], rows[54], rows[52], rows[54]);
3166 zip16<2>(rows[53], rows[55], rows[53], rows[55]);
3167 zip16<2>(rows[56], rows[58], rows[56], rows[58]);
3168 zip16<2>(rows[57], rows[59], rows[57], rows[59]);
3169 zip16<2>(rows[60], rows[62], rows[60], rows[62]);
3170 zip16<2>(rows[61], rows[63], rows[61], rows[63]);
3171 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
3172 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
3173 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
3174 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
3175 zip16<4>(rows[8], rows[12], rows[8], rows[12]);
3176 zip16<4>(rows[10], rows[14], rows[10], rows[14]);
3177 zip16<4>(rows[9], rows[13], rows[9], rows[13]);
3178 zip16<4>(rows[11], rows[15], rows[11], rows[15]);
3179 zip16<4>(rows[16], rows[20], rows[16], rows[20]);
3180 zip16<4>(rows[18], rows[22], rows[18], rows[22]);
3181 zip16<4>(rows[17], rows[21], rows[17], rows[21]);
3182 zip16<4>(rows[19], rows[23], rows[19], rows[23]);
3183 zip16<4>(rows[24], rows[28], rows[24], rows[28]);
3184 zip16<4>(rows[26], rows[30], rows[26], rows[30]);
3185 zip16<4>(rows[25], rows[29], rows[25], rows[29]);
3186 zip16<4>(rows[27], rows[31], rows[27], rows[31]);
3187 zip16<4>(rows[32], rows[36], rows[32], rows[36]);
3188 zip16<4>(rows[34], rows[38], rows[34], rows[38]);
3189 zip16<4>(rows[33], rows[37], rows[33], rows[37]);
3190 zip16<4>(rows[35], rows[39], rows[35], rows[39]);
3191 zip16<4>(rows[40], rows[44], rows[40], rows[44]);
3192 zip16<4>(rows[42], rows[46], rows[42], rows[46]);
3193 zip16<4>(rows[41], rows[45], rows[41], rows[45]);
3194 zip16<4>(rows[43], rows[47], rows[43], rows[47]);
3195 zip16<4>(rows[48], rows[52], rows[48], rows[52]);
3196 zip16<4>(rows[50], rows[54], rows[50], rows[54]);
3197 zip16<4>(rows[49], rows[53], rows[49], rows[53]);
3198 zip16<4>(rows[51], rows[55], rows[51], rows[55]);
3199 zip16<4>(rows[56], rows[60], rows[56], rows[60]);
3200 zip16<4>(rows[58], rows[62], rows[58], rows[62]);
3201 zip16<4>(rows[57], rows[61], rows[57], rows[61]);
3202 zip16<4>(rows[59], rows[63], rows[59], rows[63]);
3203 zip16<8>(rows[0], rows[8], rows[0], rows[8]);
3204 zip16<8>(rows[4], rows[12], rows[4], rows[12]);
3205 zip16<8>(rows[2], rows[10], rows[2], rows[10]);
3206 zip16<8>(rows[6], rows[14], rows[6], rows[14]);
3207 zip16<8>(rows[1], rows[9], rows[1], rows[9]);
3208 zip16<8>(rows[5], rows[13], rows[5], rows[13]);
3209 zip16<8>(rows[3], rows[11], rows[3], rows[11]);
3210 zip16<8>(rows[7], rows[15], rows[7], rows[15]);
3211 zip16<8>(rows[16], rows[24], rows[16], rows[24]);
3212 zip16<8>(rows[20], rows[28], rows[20], rows[28]);
3213 zip16<8>(rows[18], rows[26], rows[18], rows[26]);
3214 zip16<8>(rows[22], rows[30], rows[22], rows[30]);
3215 zip16<8>(rows[17], rows[25], rows[17], rows[25]);
3216 zip16<8>(rows[21], rows[29], rows[21], rows[29]);
3217 zip16<8>(rows[19], rows[27], rows[19], rows[27]);
3218 zip16<8>(rows[23], rows[31], rows[23], rows[31]);
3219 zip16<8>(rows[32], rows[40], rows[32], rows[40]);
3220 zip16<8>(rows[36], rows[44], rows[36], rows[44]);
3221 zip16<8>(rows[34], rows[42], rows[34], rows[42]);
3222 zip16<8>(rows[38], rows[46], rows[38], rows[46]);
3223 zip16<8>(rows[33], rows[41], rows[33], rows[41]);
3224 zip16<8>(rows[37], rows[45], rows[37], rows[45]);
3225 zip16<8>(rows[35], rows[43], rows[35], rows[43]);
3226 zip16<8>(rows[39], rows[47], rows[39], rows[47]);
3227 zip16<8>(rows[48], rows[56], rows[48], rows[56]);
3228 zip16<8>(rows[52], rows[60], rows[52], rows[60]);
3229 zip16<8>(rows[50], rows[58], rows[50], rows[58]);
3230 zip16<8>(rows[54], rows[62], rows[54], rows[62]);
3231 zip16<8>(rows[49], rows[57], rows[49], rows[57]);
3232 zip16<8>(rows[53], rows[61], rows[53], rows[61]);
3233 zip16<8>(rows[51], rows[59], rows[51], rows[59]);
3234 zip16<8>(rows[55], rows[63], rows[55], rows[63]);
3235 std::swap(rows[1], rows[8]);
3236 std::swap(rows[2], rows[4]);
3237 std::swap(rows[3], rows[12]);
3238 std::swap(rows[5], rows[10]);
3239 std::swap(rows[7], rows[14]);
3240 std::swap(rows[11], rows[13]);
3241 std::swap(rows[17], rows[24]);
3242 std::swap(rows[18], rows[20]);
3243 std::swap(rows[19], rows[28]);
3244 std::swap(rows[21], rows[26]);
3245 std::swap(rows[23], rows[30]);
3246 std::swap(rows[27], rows[29]);
3247 std::swap(rows[33], rows[40]);
3248 std::swap(rows[34], rows[36]);
3249 std::swap(rows[35], rows[44]);
3250 std::swap(rows[37], rows[42]);
3251 std::swap(rows[39], rows[46]);
3252 std::swap(rows[43], rows[45]);
3253 std::swap(rows[49], rows[56]);
3254 std::swap(rows[50], rows[52]);
3255 std::swap(rows[51], rows[60]);
3256 std::swap(rows[53], rows[58]);
3257 std::swap(rows[55], rows[62]);
3258 std::swap(rows[59], rows[61]);
3259 // correction steps follow below (if required)
3260 zip<16>(rows[0], rows[16], rows[0], rows[16]);
3261 zip<16>(rows[1], rows[17], rows[1], rows[17]);
3262 zip<16>(rows[2], rows[18], rows[2], rows[18]);
3263 zip<16>(rows[3], rows[19], rows[3], rows[19]);
3264 zip<16>(rows[4], rows[20], rows[4], rows[20]);
3265 zip<16>(rows[5], rows[21], rows[5], rows[21]);
3266 zip<16>(rows[6], rows[22], rows[6], rows[22]);
3267 zip<16>(rows[7], rows[23], rows[7], rows[23]);
3268 zip<16>(rows[8], rows[24], rows[8], rows[24]);
3269 zip<16>(rows[9], rows[25], rows[9], rows[25]);
3270 zip<16>(rows[10], rows[26], rows[10], rows[26]);
3271 zip<16>(rows[11], rows[27], rows[11], rows[27]);
3272 zip<16>(rows[12], rows[28], rows[12], rows[28]);
3273 zip<16>(rows[13], rows[29], rows[13], rows[29]);
3274 zip<16>(rows[14], rows[30], rows[14], rows[30]);
3275 zip<16>(rows[15], rows[31], rows[15], rows[31]);
3276 zip<16>(rows[32], rows[48], rows[32], rows[48]);
3277 zip<16>(rows[33], rows[49], rows[33], rows[49]);
3278 zip<16>(rows[34], rows[50], rows[34], rows[50]);
3279 zip<16>(rows[35], rows[51], rows[35], rows[51]);
3280 zip<16>(rows[36], rows[52], rows[36], rows[52]);
3281 zip<16>(rows[37], rows[53], rows[37], rows[53]);
3282 zip<16>(rows[38], rows[54], rows[38], rows[54]);
3283 zip<16>(rows[39], rows[55], rows[39], rows[55]);
3284 zip<16>(rows[40], rows[56], rows[40], rows[56]);
3285 zip<16>(rows[41], rows[57], rows[41], rows[57]);
3286 zip<16>(rows[42], rows[58], rows[42], rows[58]);
3287 zip<16>(rows[43], rows[59], rows[43], rows[59]);
3288 zip<16>(rows[44], rows[60], rows[44], rows[60]);
3289 zip<16>(rows[45], rows[61], rows[45], rows[61]);
3290 zip<16>(rows[46], rows[62], rows[46], rows[62]);
3291 zip<16>(rows[47], rows[63], rows[47], rows[63]);
3292 zip<32>(rows[0], rows[32], rows[0], rows[32]);
3293 zip<32>(rows[1], rows[33], rows[1], rows[33]);
3294 zip<32>(rows[2], rows[34], rows[2], rows[34]);
3295 zip<32>(rows[3], rows[35], rows[3], rows[35]);
3296 zip<32>(rows[4], rows[36], rows[4], rows[36]);
3297 zip<32>(rows[5], rows[37], rows[5], rows[37]);
3298 zip<32>(rows[6], rows[38], rows[6], rows[38]);
3299 zip<32>(rows[7], rows[39], rows[7], rows[39]);
3300 zip<32>(rows[8], rows[40], rows[8], rows[40]);
3301 zip<32>(rows[9], rows[41], rows[9], rows[41]);
3302 zip<32>(rows[10], rows[42], rows[10], rows[42]);
3303 zip<32>(rows[11], rows[43], rows[11], rows[43]);
3304 zip<32>(rows[12], rows[44], rows[12], rows[44]);
3305 zip<32>(rows[13], rows[45], rows[13], rows[45]);
3306 zip<32>(rows[14], rows[46], rows[14], rows[46]);
3307 zip<32>(rows[15], rows[47], rows[15], rows[47]);
3308 zip<32>(rows[16], rows[48], rows[16], rows[48]);
3309 zip<32>(rows[17], rows[49], rows[17], rows[49]);
3310 zip<32>(rows[18], rows[50], rows[18], rows[50]);
3311 zip<32>(rows[19], rows[51], rows[19], rows[51]);
3312 zip<32>(rows[20], rows[52], rows[20], rows[52]);
3313 zip<32>(rows[21], rows[53], rows[21], rows[53]);
3314 zip<32>(rows[22], rows[54], rows[22], rows[54]);
3315 zip<32>(rows[23], rows[55], rows[23], rows[55]);
3316 zip<32>(rows[24], rows[56], rows[24], rows[56]);
3317 zip<32>(rows[25], rows[57], rows[25], rows[57]);
3318 zip<32>(rows[26], rows[58], rows[26], rows[58]);
3319 zip<32>(rows[27], rows[59], rows[27], rows[59]);
3320 zip<32>(rows[28], rows[60], rows[28], rows[60]);
3321 zip<32>(rows[29], rows[61], rows[29], rows[61]);
3322 zip<32>(rows[30], rows[62], rows[30], rows[62]);
3323 zip<32>(rows[31], rows[63], rows[31], rows[63]);
3324 std::swap(rows[16], rows[32]);
3325 std::swap(rows[17], rows[33]);
3326 std::swap(rows[18], rows[34]);
3327 std::swap(rows[19], rows[35]);
3328 std::swap(rows[20], rows[36]);
3329 std::swap(rows[21], rows[37]);
3330 std::swap(rows[22], rows[38]);
3331 std::swap(rows[23], rows[39]);
3332 std::swap(rows[24], rows[40]);
3333 std::swap(rows[25], rows[41]);
3334 std::swap(rows[26], rows[42]);
3335 std::swap(rows[27], rows[43]);
3336 std::swap(rows[28], rows[44]);
3337 std::swap(rows[29], rows[45]);
3338 std::swap(rows[30], rows[46]);
3339 std::swap(rows[31], rows[47]);
3340}
3341
3342template <typename T, size_t SIMD_WIDTH>
3343static SIMD_INLINE void transpose1inplcLane(
3344 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>, Bytes<64>)
3345{
3346 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3347 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3348 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3349 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3350 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
3351 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
3352 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
3353 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
3354 zip16<1>(rows[16], rows[17], rows[16], rows[17]);
3355 zip16<1>(rows[18], rows[19], rows[18], rows[19]);
3356 zip16<1>(rows[20], rows[21], rows[20], rows[21]);
3357 zip16<1>(rows[22], rows[23], rows[22], rows[23]);
3358 zip16<1>(rows[24], rows[25], rows[24], rows[25]);
3359 zip16<1>(rows[26], rows[27], rows[26], rows[27]);
3360 zip16<1>(rows[28], rows[29], rows[28], rows[29]);
3361 zip16<1>(rows[30], rows[31], rows[30], rows[31]);
3362 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
3363 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
3364 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
3365 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
3366 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
3367 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
3368 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
3369 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
3370 zip16<2>(rows[16], rows[18], rows[16], rows[18]);
3371 zip16<2>(rows[17], rows[19], rows[17], rows[19]);
3372 zip16<2>(rows[20], rows[22], rows[20], rows[22]);
3373 zip16<2>(rows[21], rows[23], rows[21], rows[23]);
3374 zip16<2>(rows[24], rows[26], rows[24], rows[26]);
3375 zip16<2>(rows[25], rows[27], rows[25], rows[27]);
3376 zip16<2>(rows[28], rows[30], rows[28], rows[30]);
3377 zip16<2>(rows[29], rows[31], rows[29], rows[31]);
3378 zip16<4>(rows[0], rows[4], rows[0], rows[4]);
3379 zip16<4>(rows[2], rows[6], rows[2], rows[6]);
3380 zip16<4>(rows[1], rows[5], rows[1], rows[5]);
3381 zip16<4>(rows[3], rows[7], rows[3], rows[7]);
3382 zip16<4>(rows[8], rows[12], rows[8], rows[12]);
3383 zip16<4>(rows[10], rows[14], rows[10], rows[14]);
3384 zip16<4>(rows[9], rows[13], rows[9], rows[13]);
3385 zip16<4>(rows[11], rows[15], rows[11], rows[15]);
3386 zip16<4>(rows[16], rows[20], rows[16], rows[20]);
3387 zip16<4>(rows[18], rows[22], rows[18], rows[22]);
3388 zip16<4>(rows[17], rows[21], rows[17], rows[21]);
3389 zip16<4>(rows[19], rows[23], rows[19], rows[23]);
3390 zip16<4>(rows[24], rows[28], rows[24], rows[28]);
3391 zip16<4>(rows[26], rows[30], rows[26], rows[30]);
3392 zip16<4>(rows[25], rows[29], rows[25], rows[29]);
3393 zip16<4>(rows[27], rows[31], rows[27], rows[31]);
3394 std::swap(rows[1], rows[4]);
3395 std::swap(rows[3], rows[6]);
3396 std::swap(rows[9], rows[12]);
3397 std::swap(rows[11], rows[14]);
3398 std::swap(rows[17], rows[20]);
3399 std::swap(rows[19], rows[22]);
3400 std::swap(rows[25], rows[28]);
3401 std::swap(rows[27], rows[30]);
3402 // correction steps follow below (if required)
3403 zip<8>(rows[0], rows[8], rows[0], rows[8]);
3404 zip<8>(rows[1], rows[9], rows[1], rows[9]);
3405 zip<8>(rows[2], rows[10], rows[2], rows[10]);
3406 zip<8>(rows[3], rows[11], rows[3], rows[11]);
3407 zip<8>(rows[4], rows[12], rows[4], rows[12]);
3408 zip<8>(rows[5], rows[13], rows[5], rows[13]);
3409 zip<8>(rows[6], rows[14], rows[6], rows[14]);
3410 zip<8>(rows[7], rows[15], rows[7], rows[15]);
3411 zip<8>(rows[16], rows[24], rows[16], rows[24]);
3412 zip<8>(rows[17], rows[25], rows[17], rows[25]);
3413 zip<8>(rows[18], rows[26], rows[18], rows[26]);
3414 zip<8>(rows[19], rows[27], rows[19], rows[27]);
3415 zip<8>(rows[20], rows[28], rows[20], rows[28]);
3416 zip<8>(rows[21], rows[29], rows[21], rows[29]);
3417 zip<8>(rows[22], rows[30], rows[22], rows[30]);
3418 zip<8>(rows[23], rows[31], rows[23], rows[31]);
3419 zip<16>(rows[0], rows[16], rows[0], rows[16]);
3420 zip<16>(rows[1], rows[17], rows[1], rows[17]);
3421 zip<16>(rows[2], rows[18], rows[2], rows[18]);
3422 zip<16>(rows[3], rows[19], rows[3], rows[19]);
3423 zip<16>(rows[4], rows[20], rows[4], rows[20]);
3424 zip<16>(rows[5], rows[21], rows[5], rows[21]);
3425 zip<16>(rows[6], rows[22], rows[6], rows[22]);
3426 zip<16>(rows[7], rows[23], rows[7], rows[23]);
3427 zip<16>(rows[8], rows[24], rows[8], rows[24]);
3428 zip<16>(rows[9], rows[25], rows[9], rows[25]);
3429 zip<16>(rows[10], rows[26], rows[10], rows[26]);
3430 zip<16>(rows[11], rows[27], rows[11], rows[27]);
3431 zip<16>(rows[12], rows[28], rows[12], rows[28]);
3432 zip<16>(rows[13], rows[29], rows[13], rows[29]);
3433 zip<16>(rows[14], rows[30], rows[14], rows[30]);
3434 zip<16>(rows[15], rows[31], rows[15], rows[31]);
3435 std::swap(rows[8], rows[16]);
3436 std::swap(rows[9], rows[17]);
3437 std::swap(rows[10], rows[18]);
3438 std::swap(rows[11], rows[19]);
3439 std::swap(rows[12], rows[20]);
3440 std::swap(rows[13], rows[21]);
3441 std::swap(rows[14], rows[22]);
3442 std::swap(rows[15], rows[23]);
3443}
3444
3445template <typename T, size_t SIMD_WIDTH>
3446static SIMD_INLINE void transpose1inplcLane(
3447 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<64>)
3448{
3449 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3450 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3451 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3452 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3453 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
3454 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
3455 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
3456 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
3457 zip16<2>(rows[0], rows[2], rows[0], rows[2]);
3458 zip16<2>(rows[1], rows[3], rows[1], rows[3]);
3459 zip16<2>(rows[4], rows[6], rows[4], rows[6]);
3460 zip16<2>(rows[5], rows[7], rows[5], rows[7]);
3461 zip16<2>(rows[8], rows[10], rows[8], rows[10]);
3462 zip16<2>(rows[9], rows[11], rows[9], rows[11]);
3463 zip16<2>(rows[12], rows[14], rows[12], rows[14]);
3464 zip16<2>(rows[13], rows[15], rows[13], rows[15]);
3465 std::swap(rows[1], rows[2]);
3466 std::swap(rows[5], rows[6]);
3467 std::swap(rows[9], rows[10]);
3468 std::swap(rows[13], rows[14]);
3469 // correction steps follow below (if required)
3470 zip<4>(rows[0], rows[4], rows[0], rows[4]);
3471 zip<4>(rows[1], rows[5], rows[1], rows[5]);
3472 zip<4>(rows[2], rows[6], rows[2], rows[6]);
3473 zip<4>(rows[3], rows[7], rows[3], rows[7]);
3474 zip<4>(rows[8], rows[12], rows[8], rows[12]);
3475 zip<4>(rows[9], rows[13], rows[9], rows[13]);
3476 zip<4>(rows[10], rows[14], rows[10], rows[14]);
3477 zip<4>(rows[11], rows[15], rows[11], rows[15]);
3478 zip<8>(rows[0], rows[8], rows[0], rows[8]);
3479 zip<8>(rows[1], rows[9], rows[1], rows[9]);
3480 zip<8>(rows[2], rows[10], rows[2], rows[10]);
3481 zip<8>(rows[3], rows[11], rows[3], rows[11]);
3482 zip<8>(rows[4], rows[12], rows[4], rows[12]);
3483 zip<8>(rows[5], rows[13], rows[5], rows[13]);
3484 zip<8>(rows[6], rows[14], rows[6], rows[14]);
3485 zip<8>(rows[7], rows[15], rows[7], rows[15]);
3486 std::swap(rows[4], rows[8]);
3487 std::swap(rows[5], rows[9]);
3488 std::swap(rows[6], rows[10]);
3489 std::swap(rows[7], rows[11]);
3490}
3491
3492template <typename T, size_t SIMD_WIDTH>
3493static SIMD_INLINE void transpose1inplcLane(
3494 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<64>)
3495{
3496 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3497 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3498 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3499 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3500 // correction steps follow below (if required)
3501 zip<2>(rows[0], rows[2], rows[0], rows[2]);
3502 zip<2>(rows[1], rows[3], rows[1], rows[3]);
3503 zip<2>(rows[4], rows[6], rows[4], rows[6]);
3504 zip<2>(rows[5], rows[7], rows[5], rows[7]);
3505 zip<4>(rows[0], rows[4], rows[0], rows[4]);
3506 zip<4>(rows[1], rows[5], rows[1], rows[5]);
3507 zip<4>(rows[2], rows[6], rows[2], rows[6]);
3508 zip<4>(rows[3], rows[7], rows[3], rows[7]);
3509 std::swap(rows[2], rows[4]);
3510 std::swap(rows[3], rows[5]);
3511}
3512
3513template <typename T, size_t SIMD_WIDTH>
3514static SIMD_INLINE void transpose1inplcLane(
3515 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems])
3516{
3517 transpose1inplcLane(rows, Elements<Vec<T, SIMD_WIDTH>::elements>(),
3518 Bytes<SIMD_WIDTH>());
3519}
3520
3521// ==========================================================
3522// transpose2inplc (1-argument version)
3523// ==========================================================
3524
3525template <typename T, size_t SIMD_WIDTH>
3526static SIMD_INLINE void transpose2inplc(
3527 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<2>)
3528{
3529 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3530}
3531
3532template <typename T, size_t SIMD_WIDTH>
3533static SIMD_INLINE void transpose2inplc(
3534 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>)
3535{
3536 zip<1>(rows[0], rows[2], rows[0], rows[2]);
3537 zip<1>(rows[1], rows[3], rows[1], rows[3]);
3538 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3539 zip<1>(rows[2], rows[3], rows[2], rows[3]);
3540}
3541
3542template <typename T, size_t SIMD_WIDTH>
3543static SIMD_INLINE void transpose2inplc(
3544 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>)
3545{
3546 zip<1>(rows[0], rows[4], rows[0], rows[4]);
3547 zip<1>(rows[1], rows[5], rows[1], rows[5]);
3548 zip<1>(rows[2], rows[6], rows[2], rows[6]);
3549 zip<1>(rows[3], rows[7], rows[3], rows[7]);
3550 zip<1>(rows[0], rows[2], rows[0], rows[2]);
3551 zip<1>(rows[4], rows[6], rows[4], rows[6]);
3552 zip<1>(rows[1], rows[3], rows[1], rows[3]);
3553 zip<1>(rows[5], rows[7], rows[5], rows[7]);
3554 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3555 zip<1>(rows[2], rows[3], rows[2], rows[3]);
3556 zip<1>(rows[4], rows[5], rows[4], rows[5]);
3557 zip<1>(rows[6], rows[7], rows[6], rows[7]);
3558}
3559
3560template <typename T, size_t SIMD_WIDTH>
3561static SIMD_INLINE void transpose2inplc(
3562 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>)
3563{
3564 zip<1>(rows[0], rows[8], rows[0], rows[8]);
3565 zip<1>(rows[1], rows[9], rows[1], rows[9]);
3566 zip<1>(rows[2], rows[10], rows[2], rows[10]);
3567 zip<1>(rows[3], rows[11], rows[3], rows[11]);
3568 zip<1>(rows[4], rows[12], rows[4], rows[12]);
3569 zip<1>(rows[5], rows[13], rows[5], rows[13]);
3570 zip<1>(rows[6], rows[14], rows[6], rows[14]);
3571 zip<1>(rows[7], rows[15], rows[7], rows[15]);
3572 zip<1>(rows[0], rows[4], rows[0], rows[4]);
3573 zip<1>(rows[8], rows[12], rows[8], rows[12]);
3574 zip<1>(rows[1], rows[5], rows[1], rows[5]);
3575 zip<1>(rows[9], rows[13], rows[9], rows[13]);
3576 zip<1>(rows[2], rows[6], rows[2], rows[6]);
3577 zip<1>(rows[10], rows[14], rows[10], rows[14]);
3578 zip<1>(rows[3], rows[7], rows[3], rows[7]);
3579 zip<1>(rows[11], rows[15], rows[11], rows[15]);
3580 zip<1>(rows[0], rows[2], rows[0], rows[2]);
3581 zip<1>(rows[4], rows[6], rows[4], rows[6]);
3582 zip<1>(rows[8], rows[10], rows[8], rows[10]);
3583 zip<1>(rows[12], rows[14], rows[12], rows[14]);
3584 zip<1>(rows[1], rows[3], rows[1], rows[3]);
3585 zip<1>(rows[5], rows[7], rows[5], rows[7]);
3586 zip<1>(rows[9], rows[11], rows[9], rows[11]);
3587 zip<1>(rows[13], rows[15], rows[13], rows[15]);
3588 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3589 zip<1>(rows[2], rows[3], rows[2], rows[3]);
3590 zip<1>(rows[4], rows[5], rows[4], rows[5]);
3591 zip<1>(rows[6], rows[7], rows[6], rows[7]);
3592 zip<1>(rows[8], rows[9], rows[8], rows[9]);
3593 zip<1>(rows[10], rows[11], rows[10], rows[11]);
3594 zip<1>(rows[12], rows[13], rows[12], rows[13]);
3595 zip<1>(rows[14], rows[15], rows[14], rows[15]);
3596}
3597
3598template <typename T, size_t SIMD_WIDTH>
3599static SIMD_INLINE void transpose2inplc(
3600 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>)
3601{
3602 zip<1>(rows[0], rows[16], rows[0], rows[16]);
3603 zip<1>(rows[1], rows[17], rows[1], rows[17]);
3604 zip<1>(rows[2], rows[18], rows[2], rows[18]);
3605 zip<1>(rows[3], rows[19], rows[3], rows[19]);
3606 zip<1>(rows[4], rows[20], rows[4], rows[20]);
3607 zip<1>(rows[5], rows[21], rows[5], rows[21]);
3608 zip<1>(rows[6], rows[22], rows[6], rows[22]);
3609 zip<1>(rows[7], rows[23], rows[7], rows[23]);
3610 zip<1>(rows[8], rows[24], rows[8], rows[24]);
3611 zip<1>(rows[9], rows[25], rows[9], rows[25]);
3612 zip<1>(rows[10], rows[26], rows[10], rows[26]);
3613 zip<1>(rows[11], rows[27], rows[11], rows[27]);
3614 zip<1>(rows[12], rows[28], rows[12], rows[28]);
3615 zip<1>(rows[13], rows[29], rows[13], rows[29]);
3616 zip<1>(rows[14], rows[30], rows[14], rows[30]);
3617 zip<1>(rows[15], rows[31], rows[15], rows[31]);
3618 zip<1>(rows[0], rows[8], rows[0], rows[8]);
3619 zip<1>(rows[16], rows[24], rows[16], rows[24]);
3620 zip<1>(rows[1], rows[9], rows[1], rows[9]);
3621 zip<1>(rows[17], rows[25], rows[17], rows[25]);
3622 zip<1>(rows[2], rows[10], rows[2], rows[10]);
3623 zip<1>(rows[18], rows[26], rows[18], rows[26]);
3624 zip<1>(rows[3], rows[11], rows[3], rows[11]);
3625 zip<1>(rows[19], rows[27], rows[19], rows[27]);
3626 zip<1>(rows[4], rows[12], rows[4], rows[12]);
3627 zip<1>(rows[20], rows[28], rows[20], rows[28]);
3628 zip<1>(rows[5], rows[13], rows[5], rows[13]);
3629 zip<1>(rows[21], rows[29], rows[21], rows[29]);
3630 zip<1>(rows[6], rows[14], rows[6], rows[14]);
3631 zip<1>(rows[22], rows[30], rows[22], rows[30]);
3632 zip<1>(rows[7], rows[15], rows[7], rows[15]);
3633 zip<1>(rows[23], rows[31], rows[23], rows[31]);
3634 zip<1>(rows[0], rows[4], rows[0], rows[4]);
3635 zip<1>(rows[8], rows[12], rows[8], rows[12]);
3636 zip<1>(rows[16], rows[20], rows[16], rows[20]);
3637 zip<1>(rows[24], rows[28], rows[24], rows[28]);
3638 zip<1>(rows[1], rows[5], rows[1], rows[5]);
3639 zip<1>(rows[9], rows[13], rows[9], rows[13]);
3640 zip<1>(rows[17], rows[21], rows[17], rows[21]);
3641 zip<1>(rows[25], rows[29], rows[25], rows[29]);
3642 zip<1>(rows[2], rows[6], rows[2], rows[6]);
3643 zip<1>(rows[10], rows[14], rows[10], rows[14]);
3644 zip<1>(rows[18], rows[22], rows[18], rows[22]);
3645 zip<1>(rows[26], rows[30], rows[26], rows[30]);
3646 zip<1>(rows[3], rows[7], rows[3], rows[7]);
3647 zip<1>(rows[11], rows[15], rows[11], rows[15]);
3648 zip<1>(rows[19], rows[23], rows[19], rows[23]);
3649 zip<1>(rows[27], rows[31], rows[27], rows[31]);
3650 zip<1>(rows[0], rows[2], rows[0], rows[2]);
3651 zip<1>(rows[4], rows[6], rows[4], rows[6]);
3652 zip<1>(rows[8], rows[10], rows[8], rows[10]);
3653 zip<1>(rows[12], rows[14], rows[12], rows[14]);
3654 zip<1>(rows[16], rows[18], rows[16], rows[18]);
3655 zip<1>(rows[20], rows[22], rows[20], rows[22]);
3656 zip<1>(rows[24], rows[26], rows[24], rows[26]);
3657 zip<1>(rows[28], rows[30], rows[28], rows[30]);
3658 zip<1>(rows[1], rows[3], rows[1], rows[3]);
3659 zip<1>(rows[5], rows[7], rows[5], rows[7]);
3660 zip<1>(rows[9], rows[11], rows[9], rows[11]);
3661 zip<1>(rows[13], rows[15], rows[13], rows[15]);
3662 zip<1>(rows[17], rows[19], rows[17], rows[19]);
3663 zip<1>(rows[21], rows[23], rows[21], rows[23]);
3664 zip<1>(rows[25], rows[27], rows[25], rows[27]);
3665 zip<1>(rows[29], rows[31], rows[29], rows[31]);
3666 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3667 zip<1>(rows[2], rows[3], rows[2], rows[3]);
3668 zip<1>(rows[4], rows[5], rows[4], rows[5]);
3669 zip<1>(rows[6], rows[7], rows[6], rows[7]);
3670 zip<1>(rows[8], rows[9], rows[8], rows[9]);
3671 zip<1>(rows[10], rows[11], rows[10], rows[11]);
3672 zip<1>(rows[12], rows[13], rows[12], rows[13]);
3673 zip<1>(rows[14], rows[15], rows[14], rows[15]);
3674 zip<1>(rows[16], rows[17], rows[16], rows[17]);
3675 zip<1>(rows[18], rows[19], rows[18], rows[19]);
3676 zip<1>(rows[20], rows[21], rows[20], rows[21]);
3677 zip<1>(rows[22], rows[23], rows[22], rows[23]);
3678 zip<1>(rows[24], rows[25], rows[24], rows[25]);
3679 zip<1>(rows[26], rows[27], rows[26], rows[27]);
3680 zip<1>(rows[28], rows[29], rows[28], rows[29]);
3681 zip<1>(rows[30], rows[31], rows[30], rows[31]);
3682}
3683
3684template <typename T, size_t SIMD_WIDTH>
3685static SIMD_INLINE void transpose2inplc(
3686 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<64>)
3687{
3688 zip<1>(rows[0], rows[32], rows[0], rows[32]);
3689 zip<1>(rows[1], rows[33], rows[1], rows[33]);
3690 zip<1>(rows[2], rows[34], rows[2], rows[34]);
3691 zip<1>(rows[3], rows[35], rows[3], rows[35]);
3692 zip<1>(rows[4], rows[36], rows[4], rows[36]);
3693 zip<1>(rows[5], rows[37], rows[5], rows[37]);
3694 zip<1>(rows[6], rows[38], rows[6], rows[38]);
3695 zip<1>(rows[7], rows[39], rows[7], rows[39]);
3696 zip<1>(rows[8], rows[40], rows[8], rows[40]);
3697 zip<1>(rows[9], rows[41], rows[9], rows[41]);
3698 zip<1>(rows[10], rows[42], rows[10], rows[42]);
3699 zip<1>(rows[11], rows[43], rows[11], rows[43]);
3700 zip<1>(rows[12], rows[44], rows[12], rows[44]);
3701 zip<1>(rows[13], rows[45], rows[13], rows[45]);
3702 zip<1>(rows[14], rows[46], rows[14], rows[46]);
3703 zip<1>(rows[15], rows[47], rows[15], rows[47]);
3704 zip<1>(rows[16], rows[48], rows[16], rows[48]);
3705 zip<1>(rows[17], rows[49], rows[17], rows[49]);
3706 zip<1>(rows[18], rows[50], rows[18], rows[50]);
3707 zip<1>(rows[19], rows[51], rows[19], rows[51]);
3708 zip<1>(rows[20], rows[52], rows[20], rows[52]);
3709 zip<1>(rows[21], rows[53], rows[21], rows[53]);
3710 zip<1>(rows[22], rows[54], rows[22], rows[54]);
3711 zip<1>(rows[23], rows[55], rows[23], rows[55]);
3712 zip<1>(rows[24], rows[56], rows[24], rows[56]);
3713 zip<1>(rows[25], rows[57], rows[25], rows[57]);
3714 zip<1>(rows[26], rows[58], rows[26], rows[58]);
3715 zip<1>(rows[27], rows[59], rows[27], rows[59]);
3716 zip<1>(rows[28], rows[60], rows[28], rows[60]);
3717 zip<1>(rows[29], rows[61], rows[29], rows[61]);
3718 zip<1>(rows[30], rows[62], rows[30], rows[62]);
3719 zip<1>(rows[31], rows[63], rows[31], rows[63]);
3720 zip<1>(rows[0], rows[16], rows[0], rows[16]);
3721 zip<1>(rows[32], rows[48], rows[32], rows[48]);
3722 zip<1>(rows[1], rows[17], rows[1], rows[17]);
3723 zip<1>(rows[33], rows[49], rows[33], rows[49]);
3724 zip<1>(rows[2], rows[18], rows[2], rows[18]);
3725 zip<1>(rows[34], rows[50], rows[34], rows[50]);
3726 zip<1>(rows[3], rows[19], rows[3], rows[19]);
3727 zip<1>(rows[35], rows[51], rows[35], rows[51]);
3728 zip<1>(rows[4], rows[20], rows[4], rows[20]);
3729 zip<1>(rows[36], rows[52], rows[36], rows[52]);
3730 zip<1>(rows[5], rows[21], rows[5], rows[21]);
3731 zip<1>(rows[37], rows[53], rows[37], rows[53]);
3732 zip<1>(rows[6], rows[22], rows[6], rows[22]);
3733 zip<1>(rows[38], rows[54], rows[38], rows[54]);
3734 zip<1>(rows[7], rows[23], rows[7], rows[23]);
3735 zip<1>(rows[39], rows[55], rows[39], rows[55]);
3736 zip<1>(rows[8], rows[24], rows[8], rows[24]);
3737 zip<1>(rows[40], rows[56], rows[40], rows[56]);
3738 zip<1>(rows[9], rows[25], rows[9], rows[25]);
3739 zip<1>(rows[41], rows[57], rows[41], rows[57]);
3740 zip<1>(rows[10], rows[26], rows[10], rows[26]);
3741 zip<1>(rows[42], rows[58], rows[42], rows[58]);
3742 zip<1>(rows[11], rows[27], rows[11], rows[27]);
3743 zip<1>(rows[43], rows[59], rows[43], rows[59]);
3744 zip<1>(rows[12], rows[28], rows[12], rows[28]);
3745 zip<1>(rows[44], rows[60], rows[44], rows[60]);
3746 zip<1>(rows[13], rows[29], rows[13], rows[29]);
3747 zip<1>(rows[45], rows[61], rows[45], rows[61]);
3748 zip<1>(rows[14], rows[30], rows[14], rows[30]);
3749 zip<1>(rows[46], rows[62], rows[46], rows[62]);
3750 zip<1>(rows[15], rows[31], rows[15], rows[31]);
3751 zip<1>(rows[47], rows[63], rows[47], rows[63]);
3752 zip<1>(rows[0], rows[8], rows[0], rows[8]);
3753 zip<1>(rows[16], rows[24], rows[16], rows[24]);
3754 zip<1>(rows[32], rows[40], rows[32], rows[40]);
3755 zip<1>(rows[48], rows[56], rows[48], rows[56]);
3756 zip<1>(rows[1], rows[9], rows[1], rows[9]);
3757 zip<1>(rows[17], rows[25], rows[17], rows[25]);
3758 zip<1>(rows[33], rows[41], rows[33], rows[41]);
3759 zip<1>(rows[49], rows[57], rows[49], rows[57]);
3760 zip<1>(rows[2], rows[10], rows[2], rows[10]);
3761 zip<1>(rows[18], rows[26], rows[18], rows[26]);
3762 zip<1>(rows[34], rows[42], rows[34], rows[42]);
3763 zip<1>(rows[50], rows[58], rows[50], rows[58]);
3764 zip<1>(rows[3], rows[11], rows[3], rows[11]);
3765 zip<1>(rows[19], rows[27], rows[19], rows[27]);
3766 zip<1>(rows[35], rows[43], rows[35], rows[43]);
3767 zip<1>(rows[51], rows[59], rows[51], rows[59]);
3768 zip<1>(rows[4], rows[12], rows[4], rows[12]);
3769 zip<1>(rows[20], rows[28], rows[20], rows[28]);
3770 zip<1>(rows[36], rows[44], rows[36], rows[44]);
3771 zip<1>(rows[52], rows[60], rows[52], rows[60]);
3772 zip<1>(rows[5], rows[13], rows[5], rows[13]);
3773 zip<1>(rows[21], rows[29], rows[21], rows[29]);
3774 zip<1>(rows[37], rows[45], rows[37], rows[45]);
3775 zip<1>(rows[53], rows[61], rows[53], rows[61]);
3776 zip<1>(rows[6], rows[14], rows[6], rows[14]);
3777 zip<1>(rows[22], rows[30], rows[22], rows[30]);
3778 zip<1>(rows[38], rows[46], rows[38], rows[46]);
3779 zip<1>(rows[54], rows[62], rows[54], rows[62]);
3780 zip<1>(rows[7], rows[15], rows[7], rows[15]);
3781 zip<1>(rows[23], rows[31], rows[23], rows[31]);
3782 zip<1>(rows[39], rows[47], rows[39], rows[47]);
3783 zip<1>(rows[55], rows[63], rows[55], rows[63]);
3784 zip<1>(rows[0], rows[4], rows[0], rows[4]);
3785 zip<1>(rows[8], rows[12], rows[8], rows[12]);
3786 zip<1>(rows[16], rows[20], rows[16], rows[20]);
3787 zip<1>(rows[24], rows[28], rows[24], rows[28]);
3788 zip<1>(rows[32], rows[36], rows[32], rows[36]);
3789 zip<1>(rows[40], rows[44], rows[40], rows[44]);
3790 zip<1>(rows[48], rows[52], rows[48], rows[52]);
3791 zip<1>(rows[56], rows[60], rows[56], rows[60]);
3792 zip<1>(rows[1], rows[5], rows[1], rows[5]);
3793 zip<1>(rows[9], rows[13], rows[9], rows[13]);
3794 zip<1>(rows[17], rows[21], rows[17], rows[21]);
3795 zip<1>(rows[25], rows[29], rows[25], rows[29]);
3796 zip<1>(rows[33], rows[37], rows[33], rows[37]);
3797 zip<1>(rows[41], rows[45], rows[41], rows[45]);
3798 zip<1>(rows[49], rows[53], rows[49], rows[53]);
3799 zip<1>(rows[57], rows[61], rows[57], rows[61]);
3800 zip<1>(rows[2], rows[6], rows[2], rows[6]);
3801 zip<1>(rows[10], rows[14], rows[10], rows[14]);
3802 zip<1>(rows[18], rows[22], rows[18], rows[22]);
3803 zip<1>(rows[26], rows[30], rows[26], rows[30]);
3804 zip<1>(rows[34], rows[38], rows[34], rows[38]);
3805 zip<1>(rows[42], rows[46], rows[42], rows[46]);
3806 zip<1>(rows[50], rows[54], rows[50], rows[54]);
3807 zip<1>(rows[58], rows[62], rows[58], rows[62]);
3808 zip<1>(rows[3], rows[7], rows[3], rows[7]);
3809 zip<1>(rows[11], rows[15], rows[11], rows[15]);
3810 zip<1>(rows[19], rows[23], rows[19], rows[23]);
3811 zip<1>(rows[27], rows[31], rows[27], rows[31]);
3812 zip<1>(rows[35], rows[39], rows[35], rows[39]);
3813 zip<1>(rows[43], rows[47], rows[43], rows[47]);
3814 zip<1>(rows[51], rows[55], rows[51], rows[55]);
3815 zip<1>(rows[59], rows[63], rows[59], rows[63]);
3816 zip<1>(rows[0], rows[2], rows[0], rows[2]);
3817 zip<1>(rows[4], rows[6], rows[4], rows[6]);
3818 zip<1>(rows[8], rows[10], rows[8], rows[10]);
3819 zip<1>(rows[12], rows[14], rows[12], rows[14]);
3820 zip<1>(rows[16], rows[18], rows[16], rows[18]);
3821 zip<1>(rows[20], rows[22], rows[20], rows[22]);
3822 zip<1>(rows[24], rows[26], rows[24], rows[26]);
3823 zip<1>(rows[28], rows[30], rows[28], rows[30]);
3824 zip<1>(rows[32], rows[34], rows[32], rows[34]);
3825 zip<1>(rows[36], rows[38], rows[36], rows[38]);
3826 zip<1>(rows[40], rows[42], rows[40], rows[42]);
3827 zip<1>(rows[44], rows[46], rows[44], rows[46]);
3828 zip<1>(rows[48], rows[50], rows[48], rows[50]);
3829 zip<1>(rows[52], rows[54], rows[52], rows[54]);
3830 zip<1>(rows[56], rows[58], rows[56], rows[58]);
3831 zip<1>(rows[60], rows[62], rows[60], rows[62]);
3832 zip<1>(rows[1], rows[3], rows[1], rows[3]);
3833 zip<1>(rows[5], rows[7], rows[5], rows[7]);
3834 zip<1>(rows[9], rows[11], rows[9], rows[11]);
3835 zip<1>(rows[13], rows[15], rows[13], rows[15]);
3836 zip<1>(rows[17], rows[19], rows[17], rows[19]);
3837 zip<1>(rows[21], rows[23], rows[21], rows[23]);
3838 zip<1>(rows[25], rows[27], rows[25], rows[27]);
3839 zip<1>(rows[29], rows[31], rows[29], rows[31]);
3840 zip<1>(rows[33], rows[35], rows[33], rows[35]);
3841 zip<1>(rows[37], rows[39], rows[37], rows[39]);
3842 zip<1>(rows[41], rows[43], rows[41], rows[43]);
3843 zip<1>(rows[45], rows[47], rows[45], rows[47]);
3844 zip<1>(rows[49], rows[51], rows[49], rows[51]);
3845 zip<1>(rows[53], rows[55], rows[53], rows[55]);
3846 zip<1>(rows[57], rows[59], rows[57], rows[59]);
3847 zip<1>(rows[61], rows[63], rows[61], rows[63]);
3848 zip<1>(rows[0], rows[1], rows[0], rows[1]);
3849 zip<1>(rows[2], rows[3], rows[2], rows[3]);
3850 zip<1>(rows[4], rows[5], rows[4], rows[5]);
3851 zip<1>(rows[6], rows[7], rows[6], rows[7]);
3852 zip<1>(rows[8], rows[9], rows[8], rows[9]);
3853 zip<1>(rows[10], rows[11], rows[10], rows[11]);
3854 zip<1>(rows[12], rows[13], rows[12], rows[13]);
3855 zip<1>(rows[14], rows[15], rows[14], rows[15]);
3856 zip<1>(rows[16], rows[17], rows[16], rows[17]);
3857 zip<1>(rows[18], rows[19], rows[18], rows[19]);
3858 zip<1>(rows[20], rows[21], rows[20], rows[21]);
3859 zip<1>(rows[22], rows[23], rows[22], rows[23]);
3860 zip<1>(rows[24], rows[25], rows[24], rows[25]);
3861 zip<1>(rows[26], rows[27], rows[26], rows[27]);
3862 zip<1>(rows[28], rows[29], rows[28], rows[29]);
3863 zip<1>(rows[30], rows[31], rows[30], rows[31]);
3864 zip<1>(rows[32], rows[33], rows[32], rows[33]);
3865 zip<1>(rows[34], rows[35], rows[34], rows[35]);
3866 zip<1>(rows[36], rows[37], rows[36], rows[37]);
3867 zip<1>(rows[38], rows[39], rows[38], rows[39]);
3868 zip<1>(rows[40], rows[41], rows[40], rows[41]);
3869 zip<1>(rows[42], rows[43], rows[42], rows[43]);
3870 zip<1>(rows[44], rows[45], rows[44], rows[45]);
3871 zip<1>(rows[46], rows[47], rows[46], rows[47]);
3872 zip<1>(rows[48], rows[49], rows[48], rows[49]);
3873 zip<1>(rows[50], rows[51], rows[50], rows[51]);
3874 zip<1>(rows[52], rows[53], rows[52], rows[53]);
3875 zip<1>(rows[54], rows[55], rows[54], rows[55]);
3876 zip<1>(rows[56], rows[57], rows[56], rows[57]);
3877 zip<1>(rows[58], rows[59], rows[58], rows[59]);
3878 zip<1>(rows[60], rows[61], rows[60], rows[61]);
3879 zip<1>(rows[62], rows[63], rows[62], rows[63]);
3880}
3881
3882template <typename T, size_t SIMD_WIDTH>
3883static SIMD_INLINE void transpose2inplc(
3884 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems])
3885{
3886 transpose2inplc(rows, Elements<Vec<T, SIMD_WIDTH>::elements>());
3887}
3888
3889// ==========================================================
3890// transpose2inplcLane (1-argument version)
3891// ==========================================================
3892
3893template <typename T, size_t SIMD_WIDTH>
3894static SIMD_INLINE void transpose2inplcLane(
3895 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<16>)
3896{
3897 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
3898 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
3899 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
3900 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
3901 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
3902 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
3903 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
3904 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
3905 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
3906 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
3907 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
3908 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
3909 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
3910 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
3911 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
3912 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
3913 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
3914 zip16<1>(rows[4], rows[6], rows[4], rows[6]);
3915 zip16<1>(rows[8], rows[10], rows[8], rows[10]);
3916 zip16<1>(rows[12], rows[14], rows[12], rows[14]);
3917 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
3918 zip16<1>(rows[5], rows[7], rows[5], rows[7]);
3919 zip16<1>(rows[9], rows[11], rows[9], rows[11]);
3920 zip16<1>(rows[13], rows[15], rows[13], rows[15]);
3921 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3922 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3923 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3924 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3925 zip16<1>(rows[8], rows[9], rows[8], rows[9]);
3926 zip16<1>(rows[10], rows[11], rows[10], rows[11]);
3927 zip16<1>(rows[12], rows[13], rows[12], rows[13]);
3928 zip16<1>(rows[14], rows[15], rows[14], rows[15]);
3929 // correction steps follow below (if required)
3930}
3931
3932template <typename T, size_t SIMD_WIDTH>
3933static SIMD_INLINE void transpose2inplcLane(
3934 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<16>)
3935{
3936 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
3937 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
3938 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
3939 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
3940 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
3941 zip16<1>(rows[4], rows[6], rows[4], rows[6]);
3942 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
3943 zip16<1>(rows[5], rows[7], rows[5], rows[7]);
3944 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3945 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3946 zip16<1>(rows[4], rows[5], rows[4], rows[5]);
3947 zip16<1>(rows[6], rows[7], rows[6], rows[7]);
3948 // correction steps follow below (if required)
3949}
3950
3951template <typename T, size_t SIMD_WIDTH>
3952static SIMD_INLINE void transpose2inplcLane(
3953 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<16>)
3954{
3955 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
3956 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
3957 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3958 zip16<1>(rows[2], rows[3], rows[2], rows[3]);
3959 // correction steps follow below (if required)
3960}
3961
3962template <typename T, size_t SIMD_WIDTH>
3963static SIMD_INLINE void transpose2inplcLane(
3964 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<2>, Bytes<16>)
3965{
3966 zip16<1>(rows[0], rows[1], rows[0], rows[1]);
3967 // correction steps follow below (if required)
3968}
3969
3970template <typename T, size_t SIMD_WIDTH>
3971static SIMD_INLINE void transpose2inplcLane(
3972 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>, Bytes<32>)
3973{
3974 zip16<1>(rows[0], rows[16], rows[0], rows[16]);
3975 zip16<1>(rows[1], rows[17], rows[1], rows[17]);
3976 zip16<1>(rows[2], rows[18], rows[2], rows[18]);
3977 zip16<1>(rows[3], rows[19], rows[3], rows[19]);
3978 zip16<1>(rows[4], rows[20], rows[4], rows[20]);
3979 zip16<1>(rows[5], rows[21], rows[5], rows[21]);
3980 zip16<1>(rows[6], rows[22], rows[6], rows[22]);
3981 zip16<1>(rows[7], rows[23], rows[7], rows[23]);
3982 zip16<1>(rows[8], rows[24], rows[8], rows[24]);
3983 zip16<1>(rows[9], rows[25], rows[9], rows[25]);
3984 zip16<1>(rows[10], rows[26], rows[10], rows[26]);
3985 zip16<1>(rows[11], rows[27], rows[11], rows[27]);
3986 zip16<1>(rows[12], rows[28], rows[12], rows[28]);
3987 zip16<1>(rows[13], rows[29], rows[13], rows[29]);
3988 zip16<1>(rows[14], rows[30], rows[14], rows[30]);
3989 zip16<1>(rows[15], rows[31], rows[15], rows[31]);
3990 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
3991 zip16<1>(rows[16], rows[24], rows[16], rows[24]);
3992 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
3993 zip16<1>(rows[17], rows[25], rows[17], rows[25]);
3994 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
3995 zip16<1>(rows[18], rows[26], rows[18], rows[26]);
3996 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
3997 zip16<1>(rows[19], rows[27], rows[19], rows[27]);
3998 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
3999 zip16<1>(rows[20], rows[28], rows[20], rows[28]);
4000 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
4001 zip16<1>(rows[21], rows[29], rows[21], rows[29]);
4002 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
4003 zip16<1>(rows[22], rows[30], rows[22], rows[30]);
4004 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
4005 zip16<1>(rows[23], rows[31], rows[23], rows[31]);
4006 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4007 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
4008 zip16<1>(rows[16], rows[20], rows[16], rows[20]);
4009 zip16<1>(rows[24], rows[28], rows[24], rows[28]);
4010 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4011 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
4012 zip16<1>(rows[17], rows[21], rows[17], rows[21]);
4013 zip16<1>(rows[25], rows[29], rows[25], rows[29]);
4014 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4015 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
4016 zip16<1>(rows[18], rows[22], rows[18], rows[22]);
4017 zip16<1>(rows[26], rows[30], rows[26], rows[30]);
4018 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4019 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
4020 zip16<1>(rows[19], rows[23], rows[19], rows[23]);
4021 zip16<1>(rows[27], rows[31], rows[27], rows[31]);
4022 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
4023 zip16<1>(rows[4], rows[6], rows[4], rows[6]);
4024 zip16<1>(rows[8], rows[10], rows[8], rows[10]);
4025 zip16<1>(rows[12], rows[14], rows[12], rows[14]);
4026 zip16<1>(rows[16], rows[18], rows[16], rows[18]);
4027 zip16<1>(rows[20], rows[22], rows[20], rows[22]);
4028 zip16<1>(rows[24], rows[26], rows[24], rows[26]);
4029 zip16<1>(rows[28], rows[30], rows[28], rows[30]);
4030 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
4031 zip16<1>(rows[5], rows[7], rows[5], rows[7]);
4032 zip16<1>(rows[9], rows[11], rows[9], rows[11]);
4033 zip16<1>(rows[13], rows[15], rows[13], rows[15]);
4034 zip16<1>(rows[17], rows[19], rows[17], rows[19]);
4035 zip16<1>(rows[21], rows[23], rows[21], rows[23]);
4036 zip16<1>(rows[25], rows[27], rows[25], rows[27]);
4037 zip16<1>(rows[29], rows[31], rows[29], rows[31]);
4038 // correction steps follow below (if required)
4039 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4040 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4041 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4042 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4043 zip<1>(rows[8], rows[9], rows[8], rows[9]);
4044 zip<1>(rows[10], rows[11], rows[10], rows[11]);
4045 zip<1>(rows[12], rows[13], rows[12], rows[13]);
4046 zip<1>(rows[14], rows[15], rows[14], rows[15]);
4047 zip<1>(rows[16], rows[17], rows[16], rows[17]);
4048 zip<1>(rows[18], rows[19], rows[18], rows[19]);
4049 zip<1>(rows[20], rows[21], rows[20], rows[21]);
4050 zip<1>(rows[22], rows[23], rows[22], rows[23]);
4051 zip<1>(rows[24], rows[25], rows[24], rows[25]);
4052 zip<1>(rows[26], rows[27], rows[26], rows[27]);
4053 zip<1>(rows[28], rows[29], rows[28], rows[29]);
4054 zip<1>(rows[30], rows[31], rows[30], rows[31]);
4055 {
4056 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4057 rows[1] = rows[2];
4058 rows[2] = rows[4];
4059 rows[4] = rows[8];
4060 rows[8] = rows[16];
4061 rows[16] = vec_v;
4062 }
4063 {
4064 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4065 rows[3] = rows[6];
4066 rows[6] = rows[12];
4067 rows[12] = rows[24];
4068 rows[24] = rows[17];
4069 rows[17] = vec_v;
4070 }
4071 {
4072 Vec<T, SIMD_WIDTH> vec_v = rows[5];
4073 rows[5] = rows[10];
4074 rows[10] = rows[20];
4075 rows[20] = rows[9];
4076 rows[9] = rows[18];
4077 rows[18] = vec_v;
4078 }
4079 {
4080 Vec<T, SIMD_WIDTH> vec_v = rows[7];
4081 rows[7] = rows[14];
4082 rows[14] = rows[28];
4083 rows[28] = rows[25];
4084 rows[25] = rows[19];
4085 rows[19] = vec_v;
4086 }
4087 {
4088 Vec<T, SIMD_WIDTH> vec_v = rows[11];
4089 rows[11] = rows[22];
4090 rows[22] = rows[13];
4091 rows[13] = rows[26];
4092 rows[26] = rows[21];
4093 rows[21] = vec_v;
4094 }
4095 {
4096 Vec<T, SIMD_WIDTH> vec_v = rows[15];
4097 rows[15] = rows[30];
4098 rows[30] = rows[29];
4099 rows[29] = rows[27];
4100 rows[27] = rows[23];
4101 rows[23] = vec_v;
4102 }
4103}
4104
4105template <typename T, size_t SIMD_WIDTH>
4106static SIMD_INLINE void transpose2inplcLane(
4107 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<32>)
4108{
4109 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
4110 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
4111 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
4112 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
4113 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
4114 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
4115 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
4116 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
4117 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4118 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
4119 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4120 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
4121 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4122 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
4123 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4124 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
4125 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
4126 zip16<1>(rows[4], rows[6], rows[4], rows[6]);
4127 zip16<1>(rows[8], rows[10], rows[8], rows[10]);
4128 zip16<1>(rows[12], rows[14], rows[12], rows[14]);
4129 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
4130 zip16<1>(rows[5], rows[7], rows[5], rows[7]);
4131 zip16<1>(rows[9], rows[11], rows[9], rows[11]);
4132 zip16<1>(rows[13], rows[15], rows[13], rows[15]);
4133 // correction steps follow below (if required)
4134 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4135 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4136 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4137 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4138 zip<1>(rows[8], rows[9], rows[8], rows[9]);
4139 zip<1>(rows[10], rows[11], rows[10], rows[11]);
4140 zip<1>(rows[12], rows[13], rows[12], rows[13]);
4141 zip<1>(rows[14], rows[15], rows[14], rows[15]);
4142 {
4143 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4144 rows[1] = rows[2];
4145 rows[2] = rows[4];
4146 rows[4] = rows[8];
4147 rows[8] = vec_v;
4148 }
4149 {
4150 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4151 rows[3] = rows[6];
4152 rows[6] = rows[12];
4153 rows[12] = rows[9];
4154 rows[9] = vec_v;
4155 }
4156 {
4157 Vec<T, SIMD_WIDTH> vec_v = rows[5];
4158 rows[5] = rows[10];
4159 rows[10] = vec_v;
4160 }
4161 {
4162 Vec<T, SIMD_WIDTH> vec_v = rows[7];
4163 rows[7] = rows[14];
4164 rows[14] = rows[13];
4165 rows[13] = rows[11];
4166 rows[11] = vec_v;
4167 }
4168}
4169
4170template <typename T, size_t SIMD_WIDTH>
4171static SIMD_INLINE void transpose2inplcLane(
4172 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<32>)
4173{
4174 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4175 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4176 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4177 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4178 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
4179 zip16<1>(rows[4], rows[6], rows[4], rows[6]);
4180 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
4181 zip16<1>(rows[5], rows[7], rows[5], rows[7]);
4182 // correction steps follow below (if required)
4183 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4184 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4185 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4186 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4187 {
4188 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4189 rows[1] = rows[2];
4190 rows[2] = rows[4];
4191 rows[4] = vec_v;
4192 }
4193 {
4194 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4195 rows[3] = rows[6];
4196 rows[6] = rows[5];
4197 rows[5] = vec_v;
4198 }
4199}
4200
4201template <typename T, size_t SIMD_WIDTH>
4202static SIMD_INLINE void transpose2inplcLane(
4203 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<4>, Bytes<32>)
4204{
4205 zip16<1>(rows[0], rows[2], rows[0], rows[2]);
4206 zip16<1>(rows[1], rows[3], rows[1], rows[3]);
4207 // correction steps follow below (if required)
4208 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4209 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4210 {
4211 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4212 rows[1] = rows[2];
4213 rows[2] = vec_v;
4214 }
4215}
4216
4217template <typename T, size_t SIMD_WIDTH>
4218static SIMD_INLINE void transpose2inplcLane(
4219 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<64>, Bytes<64>)
4220{
4221 zip16<1>(rows[0], rows[32], rows[0], rows[32]);
4222 zip16<1>(rows[1], rows[33], rows[1], rows[33]);
4223 zip16<1>(rows[2], rows[34], rows[2], rows[34]);
4224 zip16<1>(rows[3], rows[35], rows[3], rows[35]);
4225 zip16<1>(rows[4], rows[36], rows[4], rows[36]);
4226 zip16<1>(rows[5], rows[37], rows[5], rows[37]);
4227 zip16<1>(rows[6], rows[38], rows[6], rows[38]);
4228 zip16<1>(rows[7], rows[39], rows[7], rows[39]);
4229 zip16<1>(rows[8], rows[40], rows[8], rows[40]);
4230 zip16<1>(rows[9], rows[41], rows[9], rows[41]);
4231 zip16<1>(rows[10], rows[42], rows[10], rows[42]);
4232 zip16<1>(rows[11], rows[43], rows[11], rows[43]);
4233 zip16<1>(rows[12], rows[44], rows[12], rows[44]);
4234 zip16<1>(rows[13], rows[45], rows[13], rows[45]);
4235 zip16<1>(rows[14], rows[46], rows[14], rows[46]);
4236 zip16<1>(rows[15], rows[47], rows[15], rows[47]);
4237 zip16<1>(rows[16], rows[48], rows[16], rows[48]);
4238 zip16<1>(rows[17], rows[49], rows[17], rows[49]);
4239 zip16<1>(rows[18], rows[50], rows[18], rows[50]);
4240 zip16<1>(rows[19], rows[51], rows[19], rows[51]);
4241 zip16<1>(rows[20], rows[52], rows[20], rows[52]);
4242 zip16<1>(rows[21], rows[53], rows[21], rows[53]);
4243 zip16<1>(rows[22], rows[54], rows[22], rows[54]);
4244 zip16<1>(rows[23], rows[55], rows[23], rows[55]);
4245 zip16<1>(rows[24], rows[56], rows[24], rows[56]);
4246 zip16<1>(rows[25], rows[57], rows[25], rows[57]);
4247 zip16<1>(rows[26], rows[58], rows[26], rows[58]);
4248 zip16<1>(rows[27], rows[59], rows[27], rows[59]);
4249 zip16<1>(rows[28], rows[60], rows[28], rows[60]);
4250 zip16<1>(rows[29], rows[61], rows[29], rows[61]);
4251 zip16<1>(rows[30], rows[62], rows[30], rows[62]);
4252 zip16<1>(rows[31], rows[63], rows[31], rows[63]);
4253 zip16<1>(rows[0], rows[16], rows[0], rows[16]);
4254 zip16<1>(rows[32], rows[48], rows[32], rows[48]);
4255 zip16<1>(rows[1], rows[17], rows[1], rows[17]);
4256 zip16<1>(rows[33], rows[49], rows[33], rows[49]);
4257 zip16<1>(rows[2], rows[18], rows[2], rows[18]);
4258 zip16<1>(rows[34], rows[50], rows[34], rows[50]);
4259 zip16<1>(rows[3], rows[19], rows[3], rows[19]);
4260 zip16<1>(rows[35], rows[51], rows[35], rows[51]);
4261 zip16<1>(rows[4], rows[20], rows[4], rows[20]);
4262 zip16<1>(rows[36], rows[52], rows[36], rows[52]);
4263 zip16<1>(rows[5], rows[21], rows[5], rows[21]);
4264 zip16<1>(rows[37], rows[53], rows[37], rows[53]);
4265 zip16<1>(rows[6], rows[22], rows[6], rows[22]);
4266 zip16<1>(rows[38], rows[54], rows[38], rows[54]);
4267 zip16<1>(rows[7], rows[23], rows[7], rows[23]);
4268 zip16<1>(rows[39], rows[55], rows[39], rows[55]);
4269 zip16<1>(rows[8], rows[24], rows[8], rows[24]);
4270 zip16<1>(rows[40], rows[56], rows[40], rows[56]);
4271 zip16<1>(rows[9], rows[25], rows[9], rows[25]);
4272 zip16<1>(rows[41], rows[57], rows[41], rows[57]);
4273 zip16<1>(rows[10], rows[26], rows[10], rows[26]);
4274 zip16<1>(rows[42], rows[58], rows[42], rows[58]);
4275 zip16<1>(rows[11], rows[27], rows[11], rows[27]);
4276 zip16<1>(rows[43], rows[59], rows[43], rows[59]);
4277 zip16<1>(rows[12], rows[28], rows[12], rows[28]);
4278 zip16<1>(rows[44], rows[60], rows[44], rows[60]);
4279 zip16<1>(rows[13], rows[29], rows[13], rows[29]);
4280 zip16<1>(rows[45], rows[61], rows[45], rows[61]);
4281 zip16<1>(rows[14], rows[30], rows[14], rows[30]);
4282 zip16<1>(rows[46], rows[62], rows[46], rows[62]);
4283 zip16<1>(rows[15], rows[31], rows[15], rows[31]);
4284 zip16<1>(rows[47], rows[63], rows[47], rows[63]);
4285 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
4286 zip16<1>(rows[16], rows[24], rows[16], rows[24]);
4287 zip16<1>(rows[32], rows[40], rows[32], rows[40]);
4288 zip16<1>(rows[48], rows[56], rows[48], rows[56]);
4289 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
4290 zip16<1>(rows[17], rows[25], rows[17], rows[25]);
4291 zip16<1>(rows[33], rows[41], rows[33], rows[41]);
4292 zip16<1>(rows[49], rows[57], rows[49], rows[57]);
4293 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
4294 zip16<1>(rows[18], rows[26], rows[18], rows[26]);
4295 zip16<1>(rows[34], rows[42], rows[34], rows[42]);
4296 zip16<1>(rows[50], rows[58], rows[50], rows[58]);
4297 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
4298 zip16<1>(rows[19], rows[27], rows[19], rows[27]);
4299 zip16<1>(rows[35], rows[43], rows[35], rows[43]);
4300 zip16<1>(rows[51], rows[59], rows[51], rows[59]);
4301 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
4302 zip16<1>(rows[20], rows[28], rows[20], rows[28]);
4303 zip16<1>(rows[36], rows[44], rows[36], rows[44]);
4304 zip16<1>(rows[52], rows[60], rows[52], rows[60]);
4305 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
4306 zip16<1>(rows[21], rows[29], rows[21], rows[29]);
4307 zip16<1>(rows[37], rows[45], rows[37], rows[45]);
4308 zip16<1>(rows[53], rows[61], rows[53], rows[61]);
4309 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
4310 zip16<1>(rows[22], rows[30], rows[22], rows[30]);
4311 zip16<1>(rows[38], rows[46], rows[38], rows[46]);
4312 zip16<1>(rows[54], rows[62], rows[54], rows[62]);
4313 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
4314 zip16<1>(rows[23], rows[31], rows[23], rows[31]);
4315 zip16<1>(rows[39], rows[47], rows[39], rows[47]);
4316 zip16<1>(rows[55], rows[63], rows[55], rows[63]);
4317 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4318 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
4319 zip16<1>(rows[16], rows[20], rows[16], rows[20]);
4320 zip16<1>(rows[24], rows[28], rows[24], rows[28]);
4321 zip16<1>(rows[32], rows[36], rows[32], rows[36]);
4322 zip16<1>(rows[40], rows[44], rows[40], rows[44]);
4323 zip16<1>(rows[48], rows[52], rows[48], rows[52]);
4324 zip16<1>(rows[56], rows[60], rows[56], rows[60]);
4325 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4326 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
4327 zip16<1>(rows[17], rows[21], rows[17], rows[21]);
4328 zip16<1>(rows[25], rows[29], rows[25], rows[29]);
4329 zip16<1>(rows[33], rows[37], rows[33], rows[37]);
4330 zip16<1>(rows[41], rows[45], rows[41], rows[45]);
4331 zip16<1>(rows[49], rows[53], rows[49], rows[53]);
4332 zip16<1>(rows[57], rows[61], rows[57], rows[61]);
4333 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4334 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
4335 zip16<1>(rows[18], rows[22], rows[18], rows[22]);
4336 zip16<1>(rows[26], rows[30], rows[26], rows[30]);
4337 zip16<1>(rows[34], rows[38], rows[34], rows[38]);
4338 zip16<1>(rows[42], rows[46], rows[42], rows[46]);
4339 zip16<1>(rows[50], rows[54], rows[50], rows[54]);
4340 zip16<1>(rows[58], rows[62], rows[58], rows[62]);
4341 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4342 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
4343 zip16<1>(rows[19], rows[23], rows[19], rows[23]);
4344 zip16<1>(rows[27], rows[31], rows[27], rows[31]);
4345 zip16<1>(rows[35], rows[39], rows[35], rows[39]);
4346 zip16<1>(rows[43], rows[47], rows[43], rows[47]);
4347 zip16<1>(rows[51], rows[55], rows[51], rows[55]);
4348 zip16<1>(rows[59], rows[63], rows[59], rows[63]);
4349 // correction steps follow below (if required)
4350 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4351 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4352 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4353 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4354 zip<1>(rows[8], rows[9], rows[8], rows[9]);
4355 zip<1>(rows[10], rows[11], rows[10], rows[11]);
4356 zip<1>(rows[12], rows[13], rows[12], rows[13]);
4357 zip<1>(rows[14], rows[15], rows[14], rows[15]);
4358 zip<1>(rows[16], rows[17], rows[16], rows[17]);
4359 zip<1>(rows[18], rows[19], rows[18], rows[19]);
4360 zip<1>(rows[20], rows[21], rows[20], rows[21]);
4361 zip<1>(rows[22], rows[23], rows[22], rows[23]);
4362 zip<1>(rows[24], rows[25], rows[24], rows[25]);
4363 zip<1>(rows[26], rows[27], rows[26], rows[27]);
4364 zip<1>(rows[28], rows[29], rows[28], rows[29]);
4365 zip<1>(rows[30], rows[31], rows[30], rows[31]);
4366 zip<1>(rows[32], rows[33], rows[32], rows[33]);
4367 zip<1>(rows[34], rows[35], rows[34], rows[35]);
4368 zip<1>(rows[36], rows[37], rows[36], rows[37]);
4369 zip<1>(rows[38], rows[39], rows[38], rows[39]);
4370 zip<1>(rows[40], rows[41], rows[40], rows[41]);
4371 zip<1>(rows[42], rows[43], rows[42], rows[43]);
4372 zip<1>(rows[44], rows[45], rows[44], rows[45]);
4373 zip<1>(rows[46], rows[47], rows[46], rows[47]);
4374 zip<1>(rows[48], rows[49], rows[48], rows[49]);
4375 zip<1>(rows[50], rows[51], rows[50], rows[51]);
4376 zip<1>(rows[52], rows[53], rows[52], rows[53]);
4377 zip<1>(rows[54], rows[55], rows[54], rows[55]);
4378 zip<1>(rows[56], rows[57], rows[56], rows[57]);
4379 zip<1>(rows[58], rows[59], rows[58], rows[59]);
4380 zip<1>(rows[60], rows[61], rows[60], rows[61]);
4381 zip<1>(rows[62], rows[63], rows[62], rows[63]);
4382 zip<2>(rows[0], rows[2], rows[0], rows[2]);
4383 zip<2>(rows[4], rows[6], rows[4], rows[6]);
4384 zip<2>(rows[8], rows[10], rows[8], rows[10]);
4385 zip<2>(rows[12], rows[14], rows[12], rows[14]);
4386 zip<2>(rows[16], rows[18], rows[16], rows[18]);
4387 zip<2>(rows[20], rows[22], rows[20], rows[22]);
4388 zip<2>(rows[24], rows[26], rows[24], rows[26]);
4389 zip<2>(rows[28], rows[30], rows[28], rows[30]);
4390 zip<2>(rows[32], rows[34], rows[32], rows[34]);
4391 zip<2>(rows[36], rows[38], rows[36], rows[38]);
4392 zip<2>(rows[40], rows[42], rows[40], rows[42]);
4393 zip<2>(rows[44], rows[46], rows[44], rows[46]);
4394 zip<2>(rows[48], rows[50], rows[48], rows[50]);
4395 zip<2>(rows[52], rows[54], rows[52], rows[54]);
4396 zip<2>(rows[56], rows[58], rows[56], rows[58]);
4397 zip<2>(rows[60], rows[62], rows[60], rows[62]);
4398 zip<2>(rows[1], rows[3], rows[1], rows[3]);
4399 zip<2>(rows[5], rows[7], rows[5], rows[7]);
4400 zip<2>(rows[9], rows[11], rows[9], rows[11]);
4401 zip<2>(rows[13], rows[15], rows[13], rows[15]);
4402 zip<2>(rows[17], rows[19], rows[17], rows[19]);
4403 zip<2>(rows[21], rows[23], rows[21], rows[23]);
4404 zip<2>(rows[25], rows[27], rows[25], rows[27]);
4405 zip<2>(rows[29], rows[31], rows[29], rows[31]);
4406 zip<2>(rows[33], rows[35], rows[33], rows[35]);
4407 zip<2>(rows[37], rows[39], rows[37], rows[39]);
4408 zip<2>(rows[41], rows[43], rows[41], rows[43]);
4409 zip<2>(rows[45], rows[47], rows[45], rows[47]);
4410 zip<2>(rows[49], rows[51], rows[49], rows[51]);
4411 zip<2>(rows[53], rows[55], rows[53], rows[55]);
4412 zip<2>(rows[57], rows[59], rows[57], rows[59]);
4413 zip<2>(rows[61], rows[63], rows[61], rows[63]);
4414 {
4415 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4416 rows[1] = rows[4];
4417 rows[4] = rows[16];
4418 rows[16] = rows[2];
4419 rows[2] = rows[8];
4420 rows[8] = rows[32];
4421 rows[32] = vec_v;
4422 }
4423 {
4424 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4425 rows[3] = rows[12];
4426 rows[12] = rows[48];
4427 rows[48] = vec_v;
4428 }
4429 {
4430 Vec<T, SIMD_WIDTH> vec_v = rows[5];
4431 rows[5] = rows[20];
4432 rows[20] = rows[18];
4433 rows[18] = rows[10];
4434 rows[10] = rows[40];
4435 rows[40] = rows[33];
4436 rows[33] = vec_v;
4437 }
4438 {
4439 Vec<T, SIMD_WIDTH> vec_v = rows[6];
4440 rows[6] = rows[24];
4441 rows[24] = rows[34];
4442 rows[34] = rows[9];
4443 rows[9] = rows[36];
4444 rows[36] = rows[17];
4445 rows[17] = vec_v;
4446 }
4447 {
4448 Vec<T, SIMD_WIDTH> vec_v = rows[7];
4449 rows[7] = rows[28];
4450 rows[28] = rows[50];
4451 rows[50] = rows[11];
4452 rows[11] = rows[44];
4453 rows[44] = rows[49];
4454 rows[49] = vec_v;
4455 }
4456 {
4457 Vec<T, SIMD_WIDTH> vec_v = rows[13];
4458 rows[13] = rows[52];
4459 rows[52] = rows[19];
4460 rows[19] = rows[14];
4461 rows[14] = rows[56];
4462 rows[56] = rows[35];
4463 rows[35] = vec_v;
4464 }
4465 {
4466 Vec<T, SIMD_WIDTH> vec_v = rows[15];
4467 rows[15] = rows[60];
4468 rows[60] = rows[51];
4469 rows[51] = vec_v;
4470 }
4471 {
4472 Vec<T, SIMD_WIDTH> vec_v = rows[21];
4473 rows[21] = rows[22];
4474 rows[22] = rows[26];
4475 rows[26] = rows[42];
4476 rows[42] = rows[41];
4477 rows[41] = rows[37];
4478 rows[37] = vec_v;
4479 }
4480 {
4481 Vec<T, SIMD_WIDTH> vec_v = rows[23];
4482 rows[23] = rows[30];
4483 rows[30] = rows[58];
4484 rows[58] = rows[43];
4485 rows[43] = rows[45];
4486 rows[45] = rows[53];
4487 rows[53] = vec_v;
4488 }
4489 {
4490 Vec<T, SIMD_WIDTH> vec_v = rows[25];
4491 rows[25] = rows[38];
4492 rows[38] = vec_v;
4493 }
4494 {
4495 Vec<T, SIMD_WIDTH> vec_v = rows[27];
4496 rows[27] = rows[46];
4497 rows[46] = rows[57];
4498 rows[57] = rows[39];
4499 rows[39] = rows[29];
4500 rows[29] = rows[54];
4501 rows[54] = vec_v;
4502 }
4503 {
4504 Vec<T, SIMD_WIDTH> vec_v = rows[31];
4505 rows[31] = rows[62];
4506 rows[62] = rows[59];
4507 rows[59] = rows[47];
4508 rows[47] = rows[61];
4509 rows[61] = rows[55];
4510 rows[55] = vec_v;
4511 }
4512}
4513
4514template <typename T, size_t SIMD_WIDTH>
4515static SIMD_INLINE void transpose2inplcLane(
4516 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<32>, Bytes<64>)
4517{
4518 zip16<1>(rows[0], rows[16], rows[0], rows[16]);
4519 zip16<1>(rows[1], rows[17], rows[1], rows[17]);
4520 zip16<1>(rows[2], rows[18], rows[2], rows[18]);
4521 zip16<1>(rows[3], rows[19], rows[3], rows[19]);
4522 zip16<1>(rows[4], rows[20], rows[4], rows[20]);
4523 zip16<1>(rows[5], rows[21], rows[5], rows[21]);
4524 zip16<1>(rows[6], rows[22], rows[6], rows[22]);
4525 zip16<1>(rows[7], rows[23], rows[7], rows[23]);
4526 zip16<1>(rows[8], rows[24], rows[8], rows[24]);
4527 zip16<1>(rows[9], rows[25], rows[9], rows[25]);
4528 zip16<1>(rows[10], rows[26], rows[10], rows[26]);
4529 zip16<1>(rows[11], rows[27], rows[11], rows[27]);
4530 zip16<1>(rows[12], rows[28], rows[12], rows[28]);
4531 zip16<1>(rows[13], rows[29], rows[13], rows[29]);
4532 zip16<1>(rows[14], rows[30], rows[14], rows[30]);
4533 zip16<1>(rows[15], rows[31], rows[15], rows[31]);
4534 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
4535 zip16<1>(rows[16], rows[24], rows[16], rows[24]);
4536 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
4537 zip16<1>(rows[17], rows[25], rows[17], rows[25]);
4538 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
4539 zip16<1>(rows[18], rows[26], rows[18], rows[26]);
4540 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
4541 zip16<1>(rows[19], rows[27], rows[19], rows[27]);
4542 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
4543 zip16<1>(rows[20], rows[28], rows[20], rows[28]);
4544 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
4545 zip16<1>(rows[21], rows[29], rows[21], rows[29]);
4546 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
4547 zip16<1>(rows[22], rows[30], rows[22], rows[30]);
4548 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
4549 zip16<1>(rows[23], rows[31], rows[23], rows[31]);
4550 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4551 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
4552 zip16<1>(rows[16], rows[20], rows[16], rows[20]);
4553 zip16<1>(rows[24], rows[28], rows[24], rows[28]);
4554 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4555 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
4556 zip16<1>(rows[17], rows[21], rows[17], rows[21]);
4557 zip16<1>(rows[25], rows[29], rows[25], rows[29]);
4558 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4559 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
4560 zip16<1>(rows[18], rows[22], rows[18], rows[22]);
4561 zip16<1>(rows[26], rows[30], rows[26], rows[30]);
4562 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4563 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
4564 zip16<1>(rows[19], rows[23], rows[19], rows[23]);
4565 zip16<1>(rows[27], rows[31], rows[27], rows[31]);
4566 // correction steps follow below (if required)
4567 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4568 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4569 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4570 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4571 zip<1>(rows[8], rows[9], rows[8], rows[9]);
4572 zip<1>(rows[10], rows[11], rows[10], rows[11]);
4573 zip<1>(rows[12], rows[13], rows[12], rows[13]);
4574 zip<1>(rows[14], rows[15], rows[14], rows[15]);
4575 zip<1>(rows[16], rows[17], rows[16], rows[17]);
4576 zip<1>(rows[18], rows[19], rows[18], rows[19]);
4577 zip<1>(rows[20], rows[21], rows[20], rows[21]);
4578 zip<1>(rows[22], rows[23], rows[22], rows[23]);
4579 zip<1>(rows[24], rows[25], rows[24], rows[25]);
4580 zip<1>(rows[26], rows[27], rows[26], rows[27]);
4581 zip<1>(rows[28], rows[29], rows[28], rows[29]);
4582 zip<1>(rows[30], rows[31], rows[30], rows[31]);
4583 zip<2>(rows[0], rows[2], rows[0], rows[2]);
4584 zip<2>(rows[4], rows[6], rows[4], rows[6]);
4585 zip<2>(rows[8], rows[10], rows[8], rows[10]);
4586 zip<2>(rows[12], rows[14], rows[12], rows[14]);
4587 zip<2>(rows[16], rows[18], rows[16], rows[18]);
4588 zip<2>(rows[20], rows[22], rows[20], rows[22]);
4589 zip<2>(rows[24], rows[26], rows[24], rows[26]);
4590 zip<2>(rows[28], rows[30], rows[28], rows[30]);
4591 zip<2>(rows[1], rows[3], rows[1], rows[3]);
4592 zip<2>(rows[5], rows[7], rows[5], rows[7]);
4593 zip<2>(rows[9], rows[11], rows[9], rows[11]);
4594 zip<2>(rows[13], rows[15], rows[13], rows[15]);
4595 zip<2>(rows[17], rows[19], rows[17], rows[19]);
4596 zip<2>(rows[21], rows[23], rows[21], rows[23]);
4597 zip<2>(rows[25], rows[27], rows[25], rows[27]);
4598 zip<2>(rows[29], rows[31], rows[29], rows[31]);
4599 {
4600 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4601 rows[1] = rows[4];
4602 rows[4] = rows[16];
4603 rows[16] = vec_v;
4604 }
4605 {
4606 Vec<T, SIMD_WIDTH> vec_v = rows[2];
4607 rows[2] = rows[8];
4608 rows[8] = vec_v;
4609 }
4610 {
4611 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4612 rows[3] = rows[12];
4613 rows[12] = rows[18];
4614 rows[18] = rows[9];
4615 rows[9] = rows[6];
4616 rows[6] = rows[24];
4617 rows[24] = vec_v;
4618 }
4619 {
4620 Vec<T, SIMD_WIDTH> vec_v = rows[5];
4621 rows[5] = rows[20];
4622 rows[20] = rows[17];
4623 rows[17] = vec_v;
4624 }
4625 {
4626 Vec<T, SIMD_WIDTH> vec_v = rows[7];
4627 rows[7] = rows[28];
4628 rows[28] = rows[19];
4629 rows[19] = rows[13];
4630 rows[13] = rows[22];
4631 rows[22] = rows[25];
4632 rows[25] = vec_v;
4633 }
4634 {
4635 Vec<T, SIMD_WIDTH> vec_v = rows[11];
4636 rows[11] = rows[14];
4637 rows[14] = rows[26];
4638 rows[26] = vec_v;
4639 }
4640 {
4641 Vec<T, SIMD_WIDTH> vec_v = rows[15];
4642 rows[15] = rows[30];
4643 rows[30] = rows[27];
4644 rows[27] = vec_v;
4645 }
4646 {
4647 Vec<T, SIMD_WIDTH> vec_v = rows[23];
4648 rows[23] = rows[29];
4649 rows[29] = vec_v;
4650 }
4651}
4652
4653template <typename T, size_t SIMD_WIDTH>
4654static SIMD_INLINE void transpose2inplcLane(
4655 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<16>, Bytes<64>)
4656{
4657 zip16<1>(rows[0], rows[8], rows[0], rows[8]);
4658 zip16<1>(rows[1], rows[9], rows[1], rows[9]);
4659 zip16<1>(rows[2], rows[10], rows[2], rows[10]);
4660 zip16<1>(rows[3], rows[11], rows[3], rows[11]);
4661 zip16<1>(rows[4], rows[12], rows[4], rows[12]);
4662 zip16<1>(rows[5], rows[13], rows[5], rows[13]);
4663 zip16<1>(rows[6], rows[14], rows[6], rows[14]);
4664 zip16<1>(rows[7], rows[15], rows[7], rows[15]);
4665 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4666 zip16<1>(rows[8], rows[12], rows[8], rows[12]);
4667 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4668 zip16<1>(rows[9], rows[13], rows[9], rows[13]);
4669 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4670 zip16<1>(rows[10], rows[14], rows[10], rows[14]);
4671 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4672 zip16<1>(rows[11], rows[15], rows[11], rows[15]);
4673 // correction steps follow below (if required)
4674 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4675 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4676 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4677 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4678 zip<1>(rows[8], rows[9], rows[8], rows[9]);
4679 zip<1>(rows[10], rows[11], rows[10], rows[11]);
4680 zip<1>(rows[12], rows[13], rows[12], rows[13]);
4681 zip<1>(rows[14], rows[15], rows[14], rows[15]);
4682 zip<2>(rows[0], rows[2], rows[0], rows[2]);
4683 zip<2>(rows[4], rows[6], rows[4], rows[6]);
4684 zip<2>(rows[8], rows[10], rows[8], rows[10]);
4685 zip<2>(rows[12], rows[14], rows[12], rows[14]);
4686 zip<2>(rows[1], rows[3], rows[1], rows[3]);
4687 zip<2>(rows[5], rows[7], rows[5], rows[7]);
4688 zip<2>(rows[9], rows[11], rows[9], rows[11]);
4689 zip<2>(rows[13], rows[15], rows[13], rows[15]);
4690 {
4691 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4692 rows[1] = rows[4];
4693 rows[4] = rows[2];
4694 rows[2] = rows[8];
4695 rows[8] = vec_v;
4696 }
4697 {
4698 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4699 rows[3] = rows[12];
4700 rows[12] = vec_v;
4701 }
4702 {
4703 Vec<T, SIMD_WIDTH> vec_v = rows[5];
4704 rows[5] = rows[6];
4705 rows[6] = rows[10];
4706 rows[10] = rows[9];
4707 rows[9] = vec_v;
4708 }
4709 {
4710 Vec<T, SIMD_WIDTH> vec_v = rows[7];
4711 rows[7] = rows[14];
4712 rows[14] = rows[11];
4713 rows[11] = rows[13];
4714 rows[13] = vec_v;
4715 }
4716}
4717
4718template <typename T, size_t SIMD_WIDTH>
4719static SIMD_INLINE void transpose2inplcLane(
4720 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems], Elements<8>, Bytes<64>)
4721{
4722 zip16<1>(rows[0], rows[4], rows[0], rows[4]);
4723 zip16<1>(rows[1], rows[5], rows[1], rows[5]);
4724 zip16<1>(rows[2], rows[6], rows[2], rows[6]);
4725 zip16<1>(rows[3], rows[7], rows[3], rows[7]);
4726 // correction steps follow below (if required)
4727 zip<1>(rows[0], rows[1], rows[0], rows[1]);
4728 zip<1>(rows[2], rows[3], rows[2], rows[3]);
4729 zip<1>(rows[4], rows[5], rows[4], rows[5]);
4730 zip<1>(rows[6], rows[7], rows[6], rows[7]);
4731 zip<2>(rows[0], rows[2], rows[0], rows[2]);
4732 zip<2>(rows[4], rows[6], rows[4], rows[6]);
4733 zip<2>(rows[1], rows[3], rows[1], rows[3]);
4734 zip<2>(rows[5], rows[7], rows[5], rows[7]);
4735 {
4736 Vec<T, SIMD_WIDTH> vec_v = rows[1];
4737 rows[1] = rows[4];
4738 rows[4] = vec_v;
4739 }
4740 {
4741 Vec<T, SIMD_WIDTH> vec_v = rows[3];
4742 rows[3] = rows[6];
4743 rows[6] = vec_v;
4744 }
4745}
4746
4747template <typename T, size_t SIMD_WIDTH>
4748static SIMD_INLINE void transpose2inplcLane(
4749 Vec<T, SIMD_WIDTH> rows[Vec<T, SIMD_WIDTH>::elems])
4750{
4751 transpose2inplcLane(rows, Elements<Vec<T, SIMD_WIDTH>::elements>(),
4752 Bytes<SIMD_WIDTH>());
4753}
4754
4755} // namespace ext
4756} // namespace internal
4757} // namespace simd
4758
4759#endif // SIMD_VEC_EXT_TRANSPOSE_AUTOGEN_H_
static constexpr size_t elems
Number of elements in the vector. Alias for elements.
Definition vec.H:85
static constexpr size_t elements
Number of elements in the vector.
Definition vec.H:80
static void zip16(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of each 16-byte lane of two Vec's.
Definition base.H:1286
static void zip(const Vec< T, SIMD_WIDTH > a, const Vec< T, SIMD_WIDTH > b, Vec< T, SIMD_WIDTH > &l, Vec< T, SIMD_WIDTH > &h)
Interleaves blocks of elements of two Vec's.
Definition base.H:1247
Namespace for T-SIMD.
Definition time_measurement.H:161