#ifndef SSSE3_COMPAT_H_
#define SSSE3_COMPAT_H_

#include "intrins_intel.H"
68static inline __m128 _mm_hadd_ps(__m128 a, __m128 b)
70 float atmp[4] SIMD_ATTR_ALIGNED(16);
71 float btmp[4] SIMD_ATTR_ALIGNED(16);
72 float tmp[4] SIMD_ATTR_ALIGNED(16);
73 _mm_store_ps(atmp, a);
74 _mm_store_ps(btmp, b);
75 tmp[0] = atmp[1] + atmp[0];
76 tmp[1] = atmp[3] + atmp[2];
77 tmp[2] = btmp[1] + btmp[0];
78 tmp[3] = btmp[3] + btmp[2];
79 return _mm_load_ps(tmp);
// SSE2-level fallback for the SSE3 intrinsic _mm_hsub_ps:
//   r = [a0-a1, a2-a3, b0-b1, b2-b3]
// Same shuffle technique as the hadd fallback: separate even and odd
// lanes, then subtract odds from evens. Bit-identical to the original
// scalar emulation, without the store/load round trip.
static inline __m128 _mm_hsub_ps(__m128 a, __m128 b)
{
  // evens = [a0, a2, b0, b2], odds = [a1, a3, b1, b3]
  const __m128 evens = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 odds = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
  return _mm_sub_ps(evens, odds);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_abs_epi8 (pabsb).
// mask is 0xff in lanes where a < 0, else 0; (a ^ mask) - mask is the
// two's-complement negation of exactly the negative lanes. Note that
// abs(-128) stays -128, matching the hardware instruction (and avoiding
// the implementation-defined int->int8_t narrowing of std::abs).
static inline __m128i _mm_abs_epi8(__m128i a)
{
  const __m128i mask = _mm_cmpgt_epi8(_mm_setzero_si128(), a);
  return _mm_sub_epi8(_mm_xor_si128(a, mask), mask);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_abs_epi16 (pabsw).
// abs(x) == max(x, 0 - x) for 16-bit lanes; _mm_max_epi16 exists in
// SSE2. abs(-32768) stays -32768 (0 - (-32768) wraps), exactly as the
// hardware instruction behaves.
static inline __m128i _mm_abs_epi16(__m128i a)
{
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
}
// SSE2 fallback for the SSSE3 intrinsic _mm_abs_epi32 (pabsd).
// sign is all-ones in negative lanes (arithmetic shift of the sign
// bit); (a ^ sign) - sign negates exactly those lanes. abs(INT32_MIN)
// stays INT32_MIN, matching the hardware instruction.
static inline __m128i _mm_abs_epi32(__m128i a)
{
  const __m128i sign = _mm_srai_epi32(a, 31);
  return _mm_sub_epi32(_mm_xor_si128(a, sign), sign);
}
#ifdef _mm_alignr_epi8
#undef _mm_alignr_epi8
#endif

// Fallback for the SSSE3 intrinsic _mm_alignr_epi8 (palignr), with a
// run-time shift count n instead of the immediate the hardware needs.
// Conceptually concatenates a:b into a 32-byte value [b | a], shifts
// it right by n bytes and returns the low 16 bytes. Bytes shifted in
// from beyond a are zero, so n >= 32 yields all zeros, matching the
// hardware instruction. n must be non-negative.
static inline __m128i _mm_alignr_epi8(__m128i a, __m128i b, int n)
{
  int8_t abtmp[32];
  int8_t rtmp[16];
  _mm_storeu_si128((__m128i *) abtmp, b);        // low 16 bytes: b
  _mm_storeu_si128((__m128i *) (abtmp + 16), a); // high 16 bytes: a
  for (size_t i = 0; i < 16; i++) {
    const size_t j = i + (size_t) n;
    rtmp[i] = (j < 32) ? abtmp[j] : 0;
  }
  return _mm_loadu_si128((__m128i *) rtmp);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hadd_epi16 (phaddw):
//   r = [a1+a0, a3+a2, a5+a4, a7+a6, b1+b0, b3+b2, b5+b4, b7+b6]
// pshuflw/pshufhw/pshufd rearrange each input so its even-indexed
// words end up in the low 64 bits and its odd-indexed words in the
// high 64 bits; unpacking then pairs evens with evens and odds with
// odds across both inputs. paddw wraps on overflow, like phaddw.
static inline __m128i _mm_hadd_epi16(__m128i a, __m128i b)
{
  // [a0,a2,a4,a6, a1,a3,a5,a7]
  const __m128i ta = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i tb = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,a4,a6,b0,b2,b4,b6]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,a5,a7,b1,b3,b5,b7]
  return _mm_add_epi16(evens, odds);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hsub_epi16 (phsubw):
//   r = [a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7]
// Same even/odd word separation as the hadd fallback, followed by a
// wrapping psubw (evens minus odds).
static inline __m128i _mm_hsub_epi16(__m128i a, __m128i b)
{
  // [a0,a2,a4,a6, a1,a3,a5,a7]
  const __m128i ta = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i tb = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,a4,a6,b0,b2,b4,b6]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,a5,a7,b1,b3,b5,b7]
  return _mm_sub_epi16(evens, odds);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hadd_epi32 (phaddd):
//   r = [a1+a0, a3+a2, b1+b0, b3+b2]
// pshufd moves each input's even dwords into the low half and its odd
// dwords into the high half; unpacking pairs them across both inputs.
// paddd wraps on overflow, like phaddd.
static inline __m128i _mm_hadd_epi32(__m128i a, __m128i b)
{
  const __m128i ta = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); // [a0,a2,a1,a3]
  const __m128i tb = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0)); // [b0,b2,b1,b3]
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,b0,b2]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,b1,b3]
  return _mm_add_epi32(evens, odds);
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hsub_epi32 (phsubd):
//   r = [a0-a1, a2-a3, b0-b1, b2-b3]
// Same even/odd dword separation as the hadd fallback, followed by a
// wrapping psubd (evens minus odds).
static inline __m128i _mm_hsub_epi32(__m128i a, __m128i b)
{
  const __m128i ta = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); // [a0,a2,a1,a3]
  const __m128i tb = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0)); // [b0,b2,b1,b3]
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,b0,b2]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,b1,b3]
  return _mm_sub_epi32(evens, odds);
}
// Scalar helper: add two 16-bit values with signed saturation,
// clamping the 32-bit intermediate sum to [INT16_MIN, INT16_MAX].
static inline int16_t adds16(int16_t a, int16_t b)
{
  const int32_t sum = (int32_t) a + (int32_t) b;
  if (sum > INT16_MAX) return INT16_MAX;
  if (sum < INT16_MIN) return INT16_MIN;
  return (int16_t) sum;
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hadds_epi16 (phaddsw):
// horizontal pairwise add of (a, b) with signed saturation.
// Same even/odd word separation as the hadd fallback, then the SSE2
// saturating add paddsw — no scalar clamp helper needed.
static inline __m128i _mm_hadds_epi16(__m128i a, __m128i b)
{
  // [a0,a2,a4,a6, a1,a3,a5,a7]
  const __m128i ta = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i tb = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,a4,a6,b0,b2,b4,b6]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,a5,a7,b1,b3,b5,b7]
  return _mm_adds_epi16(evens, odds);
}
// Scalar helper: subtract two 16-bit values with signed saturation,
// clamping the 32-bit intermediate difference to [INT16_MIN, INT16_MAX].
static inline int16_t subs16(int16_t a, int16_t b)
{
  const int32_t diff = (int32_t) a - (int32_t) b;
  if (diff > INT16_MAX) return INT16_MAX;
  if (diff < INT16_MIN) return INT16_MIN;
  return (int16_t) diff;
}
// SSE2 fallback for the SSSE3 intrinsic _mm_hsubs_epi16 (phsubsw):
// horizontal pairwise subtract of (a, b) with signed saturation.
// Same even/odd word separation as the hadd fallback, then the SSE2
// saturating subtract psubsw (evens minus odds).
static inline __m128i _mm_hsubs_epi16(__m128i a, __m128i b)
{
  // [a0,a2,a4,a6, a1,a3,a5,a7]
  const __m128i ta = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i tb = _mm_shuffle_epi32(
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, _MM_SHUFFLE(3, 1, 2, 0)),
                          _MM_SHUFFLE(3, 1, 2, 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i evens = _mm_unpacklo_epi64(ta, tb); // [a0,a2,a4,a6,b0,b2,b4,b6]
  const __m128i odds = _mm_unpackhi_epi64(ta, tb);  // [a1,a3,a5,a7,b1,b3,b5,b7]
  return _mm_subs_epi16(evens, odds);
}
// Scalar emulation of the SSSE3 intrinsic _mm_shuffle_epi8 (pshufb):
// each result byte k is a[mask[k] & 15], or 0 when the high bit of
// mask[k] is set. There is no SSE2 instruction with this semantics,
// so the bytes take a round trip through memory.
static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i mask)
{
  uint8_t src[16], sel[16], dst[16];
  _mm_storeu_si128((__m128i *) src, a);
  _mm_storeu_si128((__m128i *) sel, mask);
  for (size_t k = 0; k < 16; k++) {
    if (sel[k] & 0x80) {
      dst[k] = 0; // high selector bit set: zero this byte
    } else {
      dst[k] = src[sel[k] & 0x0f];
    }
  }
  return _mm_loadu_si128((__m128i *) dst);
}
// Scalar emulation of the SSSE3 intrinsic _mm_sign_epi16 (psignw):
//   r[i] = -a[i] if b[i] < 0, 0 if b[i] == 0, a[i] otherwise.
// BUG FIX: the previous version left a[i] unchanged when b[i] == 0,
// but the hardware instruction zeroes those lanes.
// Note: -(-32768) wraps back to -32768, matching the hardware.
static inline __m128i _mm_sign_epi16(__m128i a, __m128i b)
{
  int16_t atmp[8];
  int16_t btmp[8];
  _mm_storeu_si128((__m128i *) atmp, a);
  _mm_storeu_si128((__m128i *) btmp, b);
  for (size_t i = 0; i < 8; i++) {
    if (btmp[i] < 0)
      atmp[i] = (int16_t) -atmp[i];
    else if (btmp[i] == 0)
      atmp[i] = 0; // psignw zeroes lanes whose sign operand is zero
  }
  return _mm_loadu_si128((__m128i *) atmp);
}
// NOTE(review): the two lines below were stray Doxygen-extraction
// residue ("Namespace for T-SIMD." / "Definition time_measurement.H:161")
// and are not part of this header's code; kept here as a comment only.