  1. /* Copyright (C) 2007-2019 Free Software Foundation, Inc.
  2. This file is part of GCC.
  3. GCC is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 3, or (at your option)
  6. any later version.
  7. GCC is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. Under Section 7 of GPL version 3, you are granted additional
  12. permissions described in the GCC Runtime Library Exception, version
  13. 3.1, as published by the Free Software Foundation.
  14. You should have received a copy of the GNU General Public License and
  15. a copy of the GCC Runtime Library Exception along with this program;
  16. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  17. <http://www.gnu.org/licenses/>. */
  18. /* Implemented from the specification included in the Intel C++ Compiler
  19. User Guide and Reference, version 10.0. */
  20. #ifndef _SMMINTRIN_H_INCLUDED
  21. #define _SMMINTRIN_H_INCLUDED
  22. /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
  23. files. */
  24. #include <tmmintrin.h>
  25. #ifndef __SSE4_1__
  26. #pragma GCC push_options
  27. #pragma GCC target("sse4.1")
  28. #define __DISABLE_SSE4_1__
  29. #endif /* __SSE4_1__ */
/* Rounding mode macros for the immediate operand of the ROUNDPS/
   ROUNDPD/ROUNDSS/ROUNDSD instructions.  Bits [1:0] select the
   rounding direction, bit 2 selects the current MXCSR direction
   instead, and bit 3 suppresses precision exceptions.  */
#define _MM_FROUND_TO_NEAREST_INT	0x00
#define _MM_FROUND_TO_NEG_INF		0x01
#define _MM_FROUND_TO_POS_INF		0x02
#define _MM_FROUND_TO_ZERO		0x03
#define _MM_FROUND_CUR_DIRECTION	0x04

#define _MM_FROUND_RAISE_EXC		0x00
#define _MM_FROUND_NO_EXC		0x08

/* Convenience combinations mirroring the C99 rounding functions
   (nint/floor/ceil/trunc/rint/nearbyint).  */
#define _MM_FROUND_NINT \
  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR \
  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL \
  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC \
  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT \
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT \
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
/* Test Instruction */
/* Packed integer 128-bit bitwise comparison (PTEST, ZF).  Return 1 if
   (__V & __M) == 0, i.e. no bit of __V is set inside the mask.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
}
/* Packed integer 128-bit bitwise comparison (PTEST, CF).  Return 1 if
   (__V & ~__M) == 0, i.e. __V sets no bit outside the mask.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
}
/* Packed integer 128-bit bitwise comparison (PTEST, !ZF && !CF).
   Return 1 if (__V & __M) != 0 && (__V & ~__M) != 0, i.e. __V has
   set bits both inside and outside the mask.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
}
/* Macros for packed integer 128-bit comparison intrinsics.  */
/* Nonzero iff (M & V) == 0.  */
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
/* Nonzero iff every bit of V is set; the second argument is an
   all-ones vector built by comparing V with itself for equality.  */
#define _mm_test_all_ones(V) \
  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
/* Nonzero iff V has set bits both inside and outside mask M.  */
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
/* Packed/scalar double precision floating point rounding.  The mode
   __M must be a compile-time constant built from the _MM_FROUND_*
   macros; with optimization off the intrinsics are provided as
   macros so the constant reaches the builtin.  */
#ifdef __OPTIMIZE__
/* Round both doubles of __V according to mode __M (ROUNDPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __V, const int __M)
{
  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
}

/* Round the low double of __V; upper double is taken from __D
   (ROUNDSD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd(__m128d __D, __m128d __V, const int __M)
{
  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
					   (__v2df)__V,
					   __M);
}
#else
#define _mm_round_pd(V, M) \
  ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))

#define _mm_round_sd(D, V, M) \
  ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \
				     (__v2df)(__m128d)(V), (int)(M)))
#endif
/* Packed/scalar single precision floating point rounding.  Mirrors
   the double-precision versions above.  */
#ifdef __OPTIMIZE__
/* Round all four floats of __V according to mode __M (ROUNDPS).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __V, const int __M)
{
  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
}

/* Round the low float of __V; upper three floats come from __D
   (ROUNDSS).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __D, __m128 __V, const int __M)
{
  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
					  (__v4sf)__V,
					  __M);
}
#else
#define _mm_round_ps(V, M) \
  ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))

#define _mm_round_ss(D, V, M) \
  ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \
				    (__v4sf)(__m128)(V), (int)(M)))
#endif
/* Macros for ceil/floor intrinsics — thin wrappers that fix the
   rounding mode of _mm_round_*.  */
#define _mm_ceil_pd(V)	   _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V)	   _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V)	   _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V)	   _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
/* SSE4.1 */

/* Integer blend instructions - select data from 2 sources using
   constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Per-word blend (PBLENDW): result word i comes from __Y when bit i
   of the 8-bit immediate __M is set, else from __X.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
					      (__v8hi)__Y,
					      __M);
}
#else
#define _mm_blend_epi16(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \
					(__v8hi)(__m128i)(Y), (int)(M)))
#endif
/* Per-byte variable blend (PBLENDVB): result byte i comes from __Y
   when the top bit of mask byte __M[i] is set, else from __X.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
					       (__v16qi)__Y,
					       (__v16qi)__M);
}
/* Single precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Per-float blend (BLENDPS): element i taken from __Y when bit i of
   the 4-bit immediate __M is set, else from __X.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
					  (__v4sf)__Y,
					  __M);
}
#else
#define _mm_blend_ps(X, Y, M) \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \
				    (__v4sf)(__m128)(Y), (int)(M)))
#endif
/* Per-float variable blend (BLENDVPS): element i taken from __Y when
   the sign bit of mask element __M[i] is set, else from __X.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
					   (__v4sf)__Y,
					   (__v4sf)__M);
}
/* Double precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Per-double blend (BLENDPD): element i taken from __Y when bit i of
   the 2-bit immediate __M is set, else from __X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
					   (__v2df)__Y,
					   __M);
}
#else
#define _mm_blend_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \
				     (__v2df)(__m128d)(Y), (int)(M)))
#endif
/* Per-double variable blend (BLENDVPD): element i taken from __Y when
   the sign bit of mask element __M[i] is set, else from __X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
					    (__v2df)__Y,
					    (__v2df)__M);
}
/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  The high nibble of __M selects which input elements
   participate in the product; the low nibble selects which result
   elements receive the sum (others are zeroed).  */

#ifdef __OPTIMIZE__
/* Single-precision dot product (DPPS).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
				       (__v4sf)__Y,
				       __M);
}

/* Double-precision dot product (DPPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
					(__v2df)__Y,
					__M);
}
#else
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \
				 (__v4sf)(__m128)(Y), (int)(M)))

#define _mm_dp_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \
				  (__v2df)(__m128d)(Y), (int)(M)))
#endif
/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  Uses GCC's native vector ==, which
   yields all-ones for equal lanes and zero otherwise (PCMPEQQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v2di)__X == (__v2di)__Y);
}
/* Min/max packed integer instructions.  */

/* Per-lane minimum of 16 signed bytes (PMINSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
}
/* Per-lane maximum of 16 signed bytes (PMAXSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
}
/* Per-lane minimum of 8 unsigned 16-bit words (PMINUW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
}
/* Per-lane maximum of 8 unsigned 16-bit words (PMAXUW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
}
/* Per-lane minimum of 4 signed 32-bit dwords (PMINSD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
}
/* Per-lane maximum of 4 signed 32-bit dwords (PMAXSD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
}
/* Per-lane minimum of 4 unsigned 32-bit dwords (PMINUD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
}
/* Per-lane maximum of 4 unsigned 32-bit dwords (PMAXUD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
}
/* Packed integer 32-bit multiplication with truncation of upper
   halves of results (PMULLD).  The operands are cast to the unsigned
   vector type so the wraparound on overflow is well-defined in C.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v4su)__X * (__v4su)__Y);
}
/* Packed integer 32-bit multiplication of 2 pairs of operands
   with two 64-bit results (PMULDQ): multiplies the even (0 and 2)
   signed dword lanes to full 64-bit products.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
}
/* Insert single precision float into packed single precision array
   element selected by index N (INSERTPS).  The bits [7-6] of N define
   the source S index, the bits [5-4] define the destination D index,
   and bits [3-0] are a zeroing mask applied to D afterwards.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
{
  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
					      (__v4sf)__S,
					      __N);
}
#else
#define _mm_insert_ps(D, S, N) \
  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \
					(__v4sf)(__m128)(S), (int)(N)))
#endif
/* Helper macro to create the N value for _mm_insert_ps: S is the
   source index (bits 7:6), D the destination index (bits 5:4), and
   M the 4-bit zeroing mask (bits 3:0).  */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
/* Extract binary representation of single precision float from packed
   single precision array element of X selected by index N.  The union
   type-puns the float's bits into an int without violating strict
   aliasing.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  union { int i; float f; } __tmp;
  __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
  return __tmp.i;
}
#else
#define _mm_extract_ps(X, N) \
  (__extension__ \
   ({ \
     union { int i; float f; } __tmp; \
     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
     __tmp.i; \
   }))
#endif
/* Extract binary representation of single precision float into
   D from packed single precision array element of S selected
   by index N.  Note: statement macro, D is assigned, not returned.  */
#define _MM_EXTRACT_FLOAT(D, S, N) \
  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
/* Extract specified single precision float element into the lower
   part of __m128: source index N goes to destination 0, and the
   0x0e mask zeroes destination elements 1-3.  */
#define _MM_PICK_OUT_PS(X, N)				\
  _mm_insert_ps (_mm_setzero_ps (), (X),		\
		 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
/* Insert integer, S, into packed integer array element of D
   selected by index N (PINSRB/PINSRD/PINSRQ).  The 64-bit variant is
   only available in 64-bit mode.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
						 __S, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
						__S, __N);
}

#ifdef __x86_64__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
						__S, __N);
}
#endif
#else
#define _mm_insert_epi8(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \
					   (int)(S), (int)(N)))

#define _mm_insert_epi32(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \
					  (int)(S), (int)(N)))

#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \
					  (long long)(S), (int)(N)))
#endif
#endif
/* Extract integer from packed integer array element of X selected by
   index N (PEXTRB/PEXTRD/PEXTRQ).  The byte variant zero-extends via
   the (unsigned char) cast; the 64-bit variant is only available in
   64-bit mode.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
  return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
}
#endif
#else
#define _mm_extract_epi8(X, N) \
  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
#define _mm_extract_epi32(X, N) \
  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
#define _mm_extract_epi64(X, N) \
  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif
/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively (PHMINPOSUW); remaining result bits
   are zeroed by the instruction.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
}
/* Packed integer sign-extension.  */

/* Sign-extend the low 4 bytes of __X to 32-bit lanes (PMOVSXBD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
}
/* Sign-extend the low 4 words of __X to 32-bit lanes (PMOVSXWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
}
/* Sign-extend the low 2 bytes of __X to 64-bit lanes (PMOVSXBQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
}
/* Sign-extend the low 2 dwords of __X to 64-bit lanes (PMOVSXDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
}
/* Sign-extend the low 2 words of __X to 64-bit lanes (PMOVSXWQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
}
/* Sign-extend the low 8 bytes of __X to 16-bit lanes (PMOVSXBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
}
/* Packed integer zero-extension.  */

/* Zero-extend the low 4 bytes of __X to 32-bit lanes (PMOVZXBD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
}
/* Zero-extend the low 4 words of __X to 32-bit lanes (PMOVZXWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
}
/* Zero-extend the low 2 bytes of __X to 64-bit lanes (PMOVZXBQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
}
/* Zero-extend the low 2 dwords of __X to 64-bit lanes (PMOVZXDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
}
/* Zero-extend the low 2 words of __X to 64-bit lanes (PMOVZXWQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
}
/* Zero-extend the low 8 bytes of __X to 16-bit lanes (PMOVZXBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
}
/* Pack 8 double words from 2 operands into 8 words of result with
   unsigned saturation (PACKUSDW): values are clamped to [0, 65535].  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
}
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands (MPSADBW).  Starting offsets
   within the operands are determined by the 3rd mask operand __M.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
					      (__v16qi)__Y, __M);
}
#else
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \
					(__v16qi)(__m128i)(Y), (int)(M)))
#endif
/* Load double quadword using non-temporal aligned hint (MOVNTDQA).
   __X must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_load_si128 (__m128i *__X)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
}
  493. #ifndef __SSE4_2__
  494. #pragma GCC push_options
  495. #pragma GCC target("sse4.2")
  496. #define __DISABLE_SSE4_2__
  497. #endif /* __SSE4_2__ */
/* Immediate-operand fields for the PCMPISTR*/PCMPESTR* string
   comparison intrinsics below; OR together one value from each
   group.  */

/* These macros specify the source data format (bits 1:0).  */
#define _SIDD_UBYTE_OPS			0x00
#define _SIDD_UWORD_OPS			0x01
#define _SIDD_SBYTE_OPS			0x02
#define _SIDD_SWORD_OPS			0x03

/* These macros specify the comparison operation (bits 3:2).  */
#define _SIDD_CMP_EQUAL_ANY		0x00
#define _SIDD_CMP_RANGES		0x04
#define _SIDD_CMP_EQUAL_EACH		0x08
#define _SIDD_CMP_EQUAL_ORDERED		0x0c

/* These macros specify the polarity (bits 5:4).  */
#define _SIDD_POSITIVE_POLARITY		0x00
#define _SIDD_NEGATIVE_POLARITY		0x10
#define _SIDD_MASKED_POSITIVE_POLARITY	0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY	0x30

/* These macros specify the output selection in _mm_cmpXstri ().  */
#define _SIDD_LEAST_SIGNIFICANT		0x00
#define _SIDD_MOST_SIGNIFICANT		0x40

/* These macros specify the output selection in _mm_cmpXstrm ().  */
#define _SIDD_BIT_MASK			0x00
#define _SIDD_UNIT_MASK			0x40
/* Intrinsics for text/string processing.  The "i" variants take
   implicit (NUL-terminated) lengths, the "e" variants take explicit
   lengths __LX/__LY; "m" variants return the match mask, "i" variants
   the match index.  __M is built from the _SIDD_* macros above.  */

#ifdef __OPTIMIZE__
/* Implicit-length compare, result as mask (PCMPISTRM).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
						(__v16qi)__Y,
						__M);
}

/* Implicit-length compare, result as index (PCMPISTRI).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
				      (__v16qi)__Y,
				      __M);
}

/* Explicit-length compare, result as mask (PCMPESTRM).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
						(__v16qi)__Y, __LY,
						__M);
}

/* Explicit-length compare, result as index (PCMPESTRI).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
				      (__v16qi)__Y, __LY,
				      __M);
}
#else
#define _mm_cmpistrm(X, Y, M) \
  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \
					  (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistri(X, Y, M) \
  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \
				      (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestrm(X, LX, Y, LY, M) \
  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \
					  (int)(LX), (__v16qi)(__m128i)(Y), \
					  (int)(LY), (int)(M)))
#define _mm_cmpestri(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \
				      (__v16qi)(__m128i)(Y), (int)(LY), \
				      (int)(M)))
#endif
/* Intrinsics for text/string processing and reading values of
   EFlags set by the compare.  Suffix meaning (see Intel's intrinsic
   guide): a = !CF && !ZF, c = CF (match found), o = OF (low bit of
   the result), s = SF (end of string in first operand), z = ZF (end
   of string in second operand).  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

/* Explicit-length flag readers; __LX/__LY are the operand lengths.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}
#else
#define _mm_cmpistra(X, Y, M) \
  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrc(X, Y, M) \
  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistro(X, Y, M) \
  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrs(X, Y, M) \
  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrz(X, Y, M) \
  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \
				       (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestra(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrc(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestro(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrs(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrz(X, LX, Y, LY, M) \
  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#endif
/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  Uses GCC's native vector >, which
   yields all-ones for true lanes and zero otherwise (PCMPGTQ,
   SSE4.2).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v2di)__X > (__v2di)__Y);
}
  682. #ifdef __DISABLE_SSE4_2__
  683. #undef __DISABLE_SSE4_2__
  684. #pragma GCC pop_options
  685. #endif /* __DISABLE_SSE4_2__ */
  686. #ifdef __DISABLE_SSE4_1__
  687. #undef __DISABLE_SSE4_1__
  688. #pragma GCC pop_options
  689. #endif /* __DISABLE_SSE4_1__ */
  690. #include <popcntintrin.h>
  691. #ifndef __SSE4_1__
  692. #pragma GCC push_options
  693. #pragma GCC target("sse4.1")
  694. #define __DISABLE_SSE4_1__
  695. #endif /* __SSE4_1__ */
  696. #ifndef __SSE4_2__
  697. #pragma GCC push_options
  698. #pragma GCC target("sse4.2")
  699. #define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */
/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */

/* Fold one byte __V into the running CRC32-C value __C (CRC32 r8).  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8 (unsigned int __C, unsigned char __V)
{
  return __builtin_ia32_crc32qi (__C, __V);
}
/* Fold one 16-bit value __V into the running CRC32-C value __C.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u16 (unsigned int __C, unsigned short __V)
{
  return __builtin_ia32_crc32hi (__C, __V);
}
/* Fold one 32-bit value __V into the running CRC32-C value __C.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32 (unsigned int __C, unsigned int __V)
{
  return __builtin_ia32_crc32si (__C, __V);
}
#ifdef __x86_64__
/* Fold one 64-bit value __V into the running CRC32-C value __C;
   only available in 64-bit mode.  */
extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
{
  return __builtin_ia32_crc32di (__C, __V);
}
#endif
  724. #ifdef __DISABLE_SSE4_2__
  725. #undef __DISABLE_SSE4_2__
  726. #pragma GCC pop_options
  727. #endif /* __DISABLE_SSE4_2__ */
  728. #ifdef __DISABLE_SSE4_1__
  729. #undef __DISABLE_SSE4_1__
  730. #pragma GCC pop_options
  731. #endif /* __DISABLE_SSE4_1__ */
  732. #endif /* _SMMINTRIN_H_INCLUDED */