T-SIMD v31.1.0
A C++ template SIMD library
SSSE3_compat.H
// ===========================================================================
//
// compatibility code for CPUs without SSE3 or SSSE3
//
// This source code file is part of the following software:
//
// - the low-level C++ template SIMD library
// - the SIMD implementation of the MinWarping and the 2D-Warping methods
//   for local visual homing.
//
// The software is provided based on the accompanying license agreement in the
// file LICENSE.md.
// The software is provided "as is" without any warranty by the licensor and
// without any liability of the licensor, and the software may not be
// distributed by the licensee; see the license agreement for details.
//
// (C) Ralf Möller
// Computer Engineering
// Faculty of Technology
// Bielefeld University
// www.ti.uni-bielefeld.de
//
// ===========================================================================

#pragma once
#ifndef SSSE3_COMPAT_H_
#define SSSE3_COMPAT_H_

#include "../defs.H"
#include "intrins_intel.H"

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib> // std::abs overloads for integer types

// SSE3/SSSE3 emulation for very old CPUs (very inefficient sequential code!)

// replacement for SSE3 instructions
// _mm_hadd_ps
// _mm_hsub_ps
//
// replacement for SSSE3 instructions
// _mm_abs_epi8
// _mm_abs_epi16
// _mm_abs_epi32
// _mm_alignr_epi8
// _mm_hadd_epi16
// _mm_hadd_epi32
// _mm_hadds_epi16
// _mm_hsub_epi16
// _mm_hsub_epi32
// _mm_hsubs_epi16
// _mm_shuffle_epi8
// _mm_sign_epi16

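// Usage sketch (illustrative only, not part of this header): code inside
// namespace simd can call the listed intrinsics unqualified; on CPUs with
// SSE3/SSSE3 the compiler's intrinsic is found, otherwise the replacement
// defined below is used, e.g.
//
//   namespace simd {
//   // horizontal pair sums of a and b (function name is hypothetical)
//   static inline __m128 rowPairSums(__m128 a, __m128 b)
//   {
//     return _mm_hadd_ps(a, b); // hardware intrinsic or emulation
//   }
//   } // namespace simd
//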
#ifdef __SSE2__

namespace simd {

// ===========================================================================
// SSE3 replacements
// ===========================================================================

#ifndef __SSE3__

// #warning "SSE3 intrinsics are replaced by slow sequential implementations"

static inline __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
  float atmp[4] SIMD_ATTR_ALIGNED(16);
  float btmp[4] SIMD_ATTR_ALIGNED(16);
  float tmp[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_ps(atmp, a);
  _mm_store_ps(btmp, b);
  tmp[0] = atmp[1] + atmp[0];
  tmp[1] = atmp[3] + atmp[2];
  tmp[2] = btmp[1] + btmp[0];
  tmp[3] = btmp[3] + btmp[2];
  return _mm_load_ps(tmp);
}

static inline __m128 _mm_hsub_ps(__m128 a, __m128 b)
{
  float atmp[4] SIMD_ATTR_ALIGNED(16);
  float btmp[4] SIMD_ATTR_ALIGNED(16);
  float tmp[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_ps(atmp, a);
  _mm_store_ps(btmp, b);
  tmp[0] = atmp[0] - atmp[1];
  tmp[1] = atmp[2] - atmp[3];
  tmp[2] = btmp[0] - btmp[1];
  tmp[3] = btmp[2] - btmp[3];
  return _mm_load_ps(tmp);
}

#endif

// ===========================================================================
// SSSE3 replacements
// ===========================================================================

#ifndef __SSSE3__

// #warning "SSSE3 intrinsics are replaced by slow sequential implementations"

static inline __m128i _mm_abs_epi8(__m128i a)
{
  int8_t tmp[16] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) tmp, a);
  for (size_t i = 0; i < 16; i++) tmp[i] = std::abs(tmp[i]);
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_abs_epi16(__m128i a)
{
  int16_t tmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) tmp, a);
  for (size_t i = 0; i < 8; i++) tmp[i] = std::abs(tmp[i]);
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_abs_epi32(__m128i a)
{
  int32_t tmp[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) tmp, a);
  for (size_t i = 0; i < 4; i++) tmp[i] = std::abs(tmp[i]);
  return _mm_load_si128((__m128i *) tmp);
}

// 24. Jul 18 (rm): strange: definitions from tmmintrin.h are
// included even if SSSE3 is not available;
// gcc: if __OPTIMIZE__ is not defined, the macro _mm_alignr_epi8
// clashes with the definition below;
// clang: _mm_alignr_epi8 is always defined as a macro, regardless of
// __OPTIMIZE__, and also clashes with the definition below;
// solution: undefine the macro if it is defined
#ifdef _mm_alignr_epi8
#undef _mm_alignr_epi8
#endif

// 23. Sep 15 (rm): fixed several bugs
static inline __m128i _mm_alignr_epi8(__m128i a, __m128i b, int n)
{
  int8_t abtmp[32] SIMD_ATTR_ALIGNED(16);
  int8_t rtmp[16] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) abtmp, b);
  _mm_store_si128((__m128i *) (abtmp + 16), a);
  for (size_t i = 0; i < 16; i++) {
    const size_t j = i + n;
    if (j < 32)
      rtmp[i] = abtmp[j];
    else
      rtmp[i] = 0;
  }
  return _mm_load_si128((__m128i *) rtmp);
}
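
// example: with n == 4 the result holds bytes 4..19 of the 32-byte
// concatenation a:b (b occupying the lower 16 bytes), i.e. the upper
// 12 bytes of b followed by the lower 4 bytes of a; n >= 32 yields all zeros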

static inline __m128i _mm_hadd_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t btmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t tmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = atmp[1] + atmp[0];
  tmp[1] = atmp[3] + atmp[2];
  tmp[2] = atmp[5] + atmp[4];
  tmp[3] = atmp[7] + atmp[6];
  tmp[4] = btmp[1] + btmp[0];
  tmp[5] = btmp[3] + btmp[2];
  tmp[6] = btmp[5] + btmp[4];
  tmp[7] = btmp[7] + btmp[6];
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_hsub_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t btmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t tmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = atmp[0] - atmp[1];
  tmp[1] = atmp[2] - atmp[3];
  tmp[2] = atmp[4] - atmp[5];
  tmp[3] = atmp[6] - atmp[7];
  tmp[4] = btmp[0] - btmp[1];
  tmp[5] = btmp[2] - btmp[3];
  tmp[6] = btmp[4] - btmp[5];
  tmp[7] = btmp[6] - btmp[7];
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_hadd_epi32(__m128i a, __m128i b)
{
  int32_t atmp[4] SIMD_ATTR_ALIGNED(16);
  int32_t btmp[4] SIMD_ATTR_ALIGNED(16);
  int32_t tmp[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = atmp[1] + atmp[0];
  tmp[1] = atmp[3] + atmp[2];
  tmp[2] = btmp[1] + btmp[0];
  tmp[3] = btmp[3] + btmp[2];
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_hsub_epi32(__m128i a, __m128i b)
{
  int32_t atmp[4] SIMD_ATTR_ALIGNED(16);
  int32_t btmp[4] SIMD_ATTR_ALIGNED(16);
  int32_t tmp[4] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = atmp[0] - atmp[1];
  tmp[1] = atmp[2] - atmp[3];
  tmp[2] = btmp[0] - btmp[1];
  tmp[3] = btmp[2] - btmp[3];
  return _mm_load_si128((__m128i *) tmp);
}

static inline int16_t adds16(int16_t a, int16_t b)
{
  int32_t s = int32_t(a) + int32_t(b);
  return (s < -0x8000) ? -0x8000 : (s > 0x7fff) ? 0x7fff : s;
}
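
// example: adds16(0x7000, 0x7000) saturates to 0x7fff and
// adds16(-0x7000, -0x7000) saturates to -0x8000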

static inline __m128i _mm_hadds_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t btmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t tmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = adds16(atmp[1], atmp[0]);
  tmp[1] = adds16(atmp[3], atmp[2]);
  tmp[2] = adds16(atmp[5], atmp[4]);
  tmp[3] = adds16(atmp[7], atmp[6]);
  tmp[4] = adds16(btmp[1], btmp[0]);
  tmp[5] = adds16(btmp[3], btmp[2]);
  tmp[6] = adds16(btmp[5], btmp[4]);
  tmp[7] = adds16(btmp[7], btmp[6]);
  return _mm_load_si128((__m128i *) tmp);
}

static inline int16_t subs16(int16_t a, int16_t b)
{
  int32_t s = int32_t(a) - int32_t(b);
  return (s < -0x8000) ? -0x8000 : (s > 0x7fff) ? 0x7fff : s;
}

// 12. Aug 16 (rm): fixed bug: adds16->subs16
static inline __m128i _mm_hsubs_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t btmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t tmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  tmp[0] = subs16(atmp[0], atmp[1]);
  tmp[1] = subs16(atmp[2], atmp[3]);
  tmp[2] = subs16(atmp[4], atmp[5]);
  tmp[3] = subs16(atmp[6], atmp[7]);
  tmp[4] = subs16(btmp[0], btmp[1]);
  tmp[5] = subs16(btmp[2], btmp[3]);
  tmp[6] = subs16(btmp[4], btmp[5]);
  tmp[7] = subs16(btmp[6], btmp[7]);
  return _mm_load_si128((__m128i *) tmp);
}

static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i mask)
{
  uint8_t atmp[16] SIMD_ATTR_ALIGNED(16);
  uint8_t masktmp[16] SIMD_ATTR_ALIGNED(16);
  uint8_t rtmp[16] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) masktmp, mask);
  for (size_t i = 0; i < 16; i++)
    rtmp[i] = (masktmp[i] & 0x80) ? 0 : atmp[masktmp[i] & 0x0f];
  return _mm_load_si128((__m128i *) rtmp);
}

// 1. Oct 14 (rm): added
// note: like psignw, elements of a are negated where b is negative and
// zeroed where b is zero
static inline __m128i _mm_sign_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8] SIMD_ATTR_ALIGNED(16);
  int16_t btmp[8] SIMD_ATTR_ALIGNED(16);
  _mm_store_si128((__m128i *) atmp, a);
  _mm_store_si128((__m128i *) btmp, b);
  for (size_t i = 0; i < 8; i++) {
    if (btmp[i] < 0)
      atmp[i] = -atmp[i];
    else if (btmp[i] == 0)
      atmp[i] = 0; // psignw zeroes the result where b is zero
  }
  return _mm_load_si128((__m128i *) atmp);
}

#endif

} // namespace simd

#endif // __SSE2__

#endif // SSSE3_COMPAT_H_