/* avx2intrin.h - AVX2 intrinsics.  */
/* Copyright (C) 2011-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
  18. #ifndef _IMMINTRIN_H_INCLUDED
  19. # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  20. #endif
  21. #ifndef _AVX2INTRIN_H_INCLUDED
  22. #define _AVX2INTRIN_H_INCLUDED
  23. #ifndef __AVX2__
  24. #pragma GCC push_options
  25. #pragma GCC target("avx2")
  26. #define __DISABLE_AVX2__
  27. #endif /* __AVX2__ */
  28. /* Sum absolute 8-bit integer difference of adjacent groups of 4
  29. byte integers in the first 2 operands. Starting offsets within
  30. operands are determined by the 3rd mask operand. */
  31. #ifdef __OPTIMIZE__
  32. extern __inline __m256i
  33. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  34. _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
  35. {
  36. return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
  37. (__v32qi)__Y, __M);
  38. }
  39. #else
  40. #define _mm256_mpsadbw_epu8(X, Y, M) \
  41. ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
  42. (__v32qi)(__m256i)(Y), (int)(M)))
  43. #endif
  44. extern __inline __m256i
  45. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  46. _mm256_abs_epi8 (__m256i __A)
  47. {
  48. return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
  49. }
  50. extern __inline __m256i
  51. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  52. _mm256_abs_epi16 (__m256i __A)
  53. {
  54. return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
  55. }
  56. extern __inline __m256i
  57. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  58. _mm256_abs_epi32 (__m256i __A)
  59. {
  60. return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
  61. }
  62. extern __inline __m256i
  63. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  64. _mm256_packs_epi32 (__m256i __A, __m256i __B)
  65. {
  66. return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
  67. }
  68. extern __inline __m256i
  69. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  70. _mm256_packs_epi16 (__m256i __A, __m256i __B)
  71. {
  72. return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
  73. }
  74. extern __inline __m256i
  75. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  76. _mm256_packus_epi32 (__m256i __A, __m256i __B)
  77. {
  78. return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
  79. }
  80. extern __inline __m256i
  81. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  82. _mm256_packus_epi16 (__m256i __A, __m256i __B)
  83. {
  84. return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
  85. }
  86. extern __inline __m256i
  87. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  88. _mm256_add_epi8 (__m256i __A, __m256i __B)
  89. {
  90. return (__m256i) ((__v32qu)__A + (__v32qu)__B);
  91. }
  92. extern __inline __m256i
  93. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  94. _mm256_add_epi16 (__m256i __A, __m256i __B)
  95. {
  96. return (__m256i) ((__v16hu)__A + (__v16hu)__B);
  97. }
  98. extern __inline __m256i
  99. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  100. _mm256_add_epi32 (__m256i __A, __m256i __B)
  101. {
  102. return (__m256i) ((__v8su)__A + (__v8su)__B);
  103. }
  104. extern __inline __m256i
  105. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  106. _mm256_add_epi64 (__m256i __A, __m256i __B)
  107. {
  108. return (__m256i) ((__v4du)__A + (__v4du)__B);
  109. }
  110. extern __inline __m256i
  111. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  112. _mm256_adds_epi8 (__m256i __A, __m256i __B)
  113. {
  114. return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
  115. }
  116. extern __inline __m256i
  117. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  118. _mm256_adds_epi16 (__m256i __A, __m256i __B)
  119. {
  120. return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
  121. }
  122. extern __inline __m256i
  123. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  124. _mm256_adds_epu8 (__m256i __A, __m256i __B)
  125. {
  126. return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
  127. }
  128. extern __inline __m256i
  129. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  130. _mm256_adds_epu16 (__m256i __A, __m256i __B)
  131. {
  132. return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
  133. }
  134. #ifdef __OPTIMIZE__
  135. extern __inline __m256i
  136. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  137. _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
  138. {
  139. return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
  140. (__v4di)__B,
  141. __N * 8);
  142. }
  143. #else
  144. /* In that case (__N*8) will be in vreg, and insn will not be matched. */
  145. /* Use define instead */
  146. #define _mm256_alignr_epi8(A, B, N) \
  147. ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
  148. (__v4di)(__m256i)(B), \
  149. (int)(N) * 8))
  150. #endif
  151. extern __inline __m256i
  152. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  153. _mm256_and_si256 (__m256i __A, __m256i __B)
  154. {
  155. return (__m256i) ((__v4du)__A & (__v4du)__B);
  156. }
  157. extern __inline __m256i
  158. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  159. _mm256_andnot_si256 (__m256i __A, __m256i __B)
  160. {
  161. return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
  162. }
  163. extern __inline __m256i
  164. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  165. _mm256_avg_epu8 (__m256i __A, __m256i __B)
  166. {
  167. return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
  168. }
  169. extern __inline __m256i
  170. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  171. _mm256_avg_epu16 (__m256i __A, __m256i __B)
  172. {
  173. return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
  174. }
  175. extern __inline __m256i
  176. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  177. _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
  178. {
  179. return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
  180. (__v32qi)__Y,
  181. (__v32qi)__M);
  182. }
  183. #ifdef __OPTIMIZE__
  184. extern __inline __m256i
  185. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  186. _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
  187. {
  188. return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
  189. (__v16hi)__Y,
  190. __M);
  191. }
  192. #else
  193. #define _mm256_blend_epi16(X, Y, M) \
  194. ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
  195. (__v16hi)(__m256i)(Y), (int)(M)))
  196. #endif
  197. extern __inline __m256i
  198. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  199. _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
  200. {
  201. return (__m256i) ((__v32qi)__A == (__v32qi)__B);
  202. }
  203. extern __inline __m256i
  204. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  205. _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
  206. {
  207. return (__m256i) ((__v16hi)__A == (__v16hi)__B);
  208. }
  209. extern __inline __m256i
  210. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  211. _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
  212. {
  213. return (__m256i) ((__v8si)__A == (__v8si)__B);
  214. }
  215. extern __inline __m256i
  216. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  217. _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
  218. {
  219. return (__m256i) ((__v4di)__A == (__v4di)__B);
  220. }
  221. extern __inline __m256i
  222. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  223. _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
  224. {
  225. return (__m256i) ((__v32qs)__A > (__v32qs)__B);
  226. }
  227. extern __inline __m256i
  228. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  229. _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
  230. {
  231. return (__m256i) ((__v16hi)__A > (__v16hi)__B);
  232. }
  233. extern __inline __m256i
  234. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  235. _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
  236. {
  237. return (__m256i) ((__v8si)__A > (__v8si)__B);
  238. }
  239. extern __inline __m256i
  240. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  241. _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
  242. {
  243. return (__m256i) ((__v4di)__A > (__v4di)__B);
  244. }
  245. extern __inline __m256i
  246. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  247. _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
  248. {
  249. return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
  250. (__v16hi)__Y);
  251. }
  252. extern __inline __m256i
  253. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  254. _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
  255. {
  256. return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
  257. }
  258. extern __inline __m256i
  259. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  260. _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
  261. {
  262. return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
  263. (__v16hi)__Y);
  264. }
  265. extern __inline __m256i
  266. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  267. _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
  268. {
  269. return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
  270. (__v16hi)__Y);
  271. }
  272. extern __inline __m256i
  273. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  274. _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
  275. {
  276. return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
  277. }
  278. extern __inline __m256i
  279. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  280. _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
  281. {
  282. return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
  283. (__v16hi)__Y);
  284. }
  285. extern __inline __m256i
  286. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  287. _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
  288. {
  289. return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
  290. (__v32qi)__Y);
  291. }
  292. extern __inline __m256i
  293. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  294. _mm256_madd_epi16 (__m256i __A, __m256i __B)
  295. {
  296. return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
  297. (__v16hi)__B);
  298. }
  299. extern __inline __m256i
  300. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  301. _mm256_max_epi8 (__m256i __A, __m256i __B)
  302. {
  303. return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
  304. }
  305. extern __inline __m256i
  306. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  307. _mm256_max_epi16 (__m256i __A, __m256i __B)
  308. {
  309. return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
  310. }
  311. extern __inline __m256i
  312. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  313. _mm256_max_epi32 (__m256i __A, __m256i __B)
  314. {
  315. return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
  316. }
  317. extern __inline __m256i
  318. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  319. _mm256_max_epu8 (__m256i __A, __m256i __B)
  320. {
  321. return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
  322. }
  323. extern __inline __m256i
  324. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  325. _mm256_max_epu16 (__m256i __A, __m256i __B)
  326. {
  327. return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
  328. }
  329. extern __inline __m256i
  330. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  331. _mm256_max_epu32 (__m256i __A, __m256i __B)
  332. {
  333. return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
  334. }
  335. extern __inline __m256i
  336. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  337. _mm256_min_epi8 (__m256i __A, __m256i __B)
  338. {
  339. return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
  340. }
  341. extern __inline __m256i
  342. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  343. _mm256_min_epi16 (__m256i __A, __m256i __B)
  344. {
  345. return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
  346. }
  347. extern __inline __m256i
  348. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  349. _mm256_min_epi32 (__m256i __A, __m256i __B)
  350. {
  351. return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
  352. }
  353. extern __inline __m256i
  354. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  355. _mm256_min_epu8 (__m256i __A, __m256i __B)
  356. {
  357. return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
  358. }
  359. extern __inline __m256i
  360. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  361. _mm256_min_epu16 (__m256i __A, __m256i __B)
  362. {
  363. return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
  364. }
  365. extern __inline __m256i
  366. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  367. _mm256_min_epu32 (__m256i __A, __m256i __B)
  368. {
  369. return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
  370. }
  371. extern __inline int
  372. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  373. _mm256_movemask_epi8 (__m256i __A)
  374. {
  375. return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
  376. }
  377. extern __inline __m256i
  378. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  379. _mm256_cvtepi8_epi16 (__m128i __X)
  380. {
  381. return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
  382. }
  383. extern __inline __m256i
  384. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  385. _mm256_cvtepi8_epi32 (__m128i __X)
  386. {
  387. return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
  388. }
  389. extern __inline __m256i
  390. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  391. _mm256_cvtepi8_epi64 (__m128i __X)
  392. {
  393. return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
  394. }
  395. extern __inline __m256i
  396. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  397. _mm256_cvtepi16_epi32 (__m128i __X)
  398. {
  399. return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
  400. }
  401. extern __inline __m256i
  402. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  403. _mm256_cvtepi16_epi64 (__m128i __X)
  404. {
  405. return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
  406. }
  407. extern __inline __m256i
  408. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  409. _mm256_cvtepi32_epi64 (__m128i __X)
  410. {
  411. return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
  412. }
  413. extern __inline __m256i
  414. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  415. _mm256_cvtepu8_epi16 (__m128i __X)
  416. {
  417. return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
  418. }
  419. extern __inline __m256i
  420. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  421. _mm256_cvtepu8_epi32 (__m128i __X)
  422. {
  423. return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
  424. }
  425. extern __inline __m256i
  426. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  427. _mm256_cvtepu8_epi64 (__m128i __X)
  428. {
  429. return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
  430. }
  431. extern __inline __m256i
  432. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  433. _mm256_cvtepu16_epi32 (__m128i __X)
  434. {
  435. return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
  436. }
  437. extern __inline __m256i
  438. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  439. _mm256_cvtepu16_epi64 (__m128i __X)
  440. {
  441. return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
  442. }
  443. extern __inline __m256i
  444. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  445. _mm256_cvtepu32_epi64 (__m128i __X)
  446. {
  447. return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
  448. }
  449. extern __inline __m256i
  450. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  451. _mm256_mul_epi32 (__m256i __X, __m256i __Y)
  452. {
  453. return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
  454. }
  455. extern __inline __m256i
  456. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  457. _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
  458. {
  459. return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
  460. (__v16hi)__Y);
  461. }
  462. extern __inline __m256i
  463. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  464. _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
  465. {
  466. return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
  467. }
  468. extern __inline __m256i
  469. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  470. _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
  471. {
  472. return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
  473. }
  474. extern __inline __m256i
  475. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  476. _mm256_mullo_epi16 (__m256i __A, __m256i __B)
  477. {
  478. return (__m256i) ((__v16hu)__A * (__v16hu)__B);
  479. }
  480. extern __inline __m256i
  481. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  482. _mm256_mullo_epi32 (__m256i __A, __m256i __B)
  483. {
  484. return (__m256i) ((__v8su)__A * (__v8su)__B);
  485. }
  486. extern __inline __m256i
  487. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  488. _mm256_mul_epu32 (__m256i __A, __m256i __B)
  489. {
  490. return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
  491. }
  492. extern __inline __m256i
  493. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  494. _mm256_or_si256 (__m256i __A, __m256i __B)
  495. {
  496. return (__m256i) ((__v4du)__A | (__v4du)__B);
  497. }
  498. extern __inline __m256i
  499. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  500. _mm256_sad_epu8 (__m256i __A, __m256i __B)
  501. {
  502. return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
  503. }
  504. extern __inline __m256i
  505. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  506. _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
  507. {
  508. return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
  509. (__v32qi)__Y);
  510. }
  511. #ifdef __OPTIMIZE__
  512. extern __inline __m256i
  513. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  514. _mm256_shuffle_epi32 (__m256i __A, const int __mask)
  515. {
  516. return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
  517. }
  518. extern __inline __m256i
  519. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  520. _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
  521. {
  522. return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
  523. }
  524. extern __inline __m256i
  525. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  526. _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
  527. {
  528. return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
  529. }
  530. #else
  531. #define _mm256_shuffle_epi32(A, N) \
  532. ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
  533. #define _mm256_shufflehi_epi16(A, N) \
  534. ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
  535. #define _mm256_shufflelo_epi16(A, N) \
  536. ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
  537. #endif
  538. extern __inline __m256i
  539. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  540. _mm256_sign_epi8 (__m256i __X, __m256i __Y)
  541. {
  542. return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
  543. }
  544. extern __inline __m256i
  545. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  546. _mm256_sign_epi16 (__m256i __X, __m256i __Y)
  547. {
  548. return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
  549. }
  550. extern __inline __m256i
  551. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  552. _mm256_sign_epi32 (__m256i __X, __m256i __Y)
  553. {
  554. return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
  555. }
  556. #ifdef __OPTIMIZE__
  557. extern __inline __m256i
  558. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  559. _mm256_bslli_epi128 (__m256i __A, const int __N)
  560. {
  561. return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
  562. }
  563. extern __inline __m256i
  564. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  565. _mm256_slli_si256 (__m256i __A, const int __N)
  566. {
  567. return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
  568. }
  569. #else
  570. #define _mm256_bslli_epi128(A, N) \
  571. ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
  572. #define _mm256_slli_si256(A, N) \
  573. ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
  574. #endif
  575. extern __inline __m256i
  576. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  577. _mm256_slli_epi16 (__m256i __A, int __B)
  578. {
  579. return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
  580. }
/* Element shifts.  The "*i" variants shift every element by the scalar
   immediate __B; the others take the count from the low 64 bits of the
   XMM operand __B.  */
/* vpsllw: shift each 16-bit element of __A left by the count in __B.  */
581. extern __inline __m256i
582. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
583. _mm256_sll_epi16 (__m256i __A, __m128i __B)
584. {
585. return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
586. }
/* vpslld (immediate): shift each 32-bit element left by __B.  */
587. extern __inline __m256i
588. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
589. _mm256_slli_epi32 (__m256i __A, int __B)
590. {
591. return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
592. }
/* vpslld: shift each 32-bit element left by the count in __B.  */
593. extern __inline __m256i
594. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
595. _mm256_sll_epi32 (__m256i __A, __m128i __B)
596. {
597. return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
598. }
/* vpsllq (immediate): shift each 64-bit element left by __B.  */
599. extern __inline __m256i
600. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
601. _mm256_slli_epi64 (__m256i __A, int __B)
602. {
603. return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
604. }
/* vpsllq: shift each 64-bit element left by the count in __B.  */
605. extern __inline __m256i
606. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
607. _mm256_sll_epi64 (__m256i __A, __m128i __B)
608. {
609. return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
610. }
/* vpsraw (immediate): arithmetic (sign-propagating) right shift of
   16-bit elements.  */
611. extern __inline __m256i
612. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
613. _mm256_srai_epi16 (__m256i __A, int __B)
614. {
615. return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
616. }
/* vpsraw: arithmetic right shift of 16-bit elements by the count in
   __B.  */
617. extern __inline __m256i
618. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
619. _mm256_sra_epi16 (__m256i __A, __m128i __B)
620. {
621. return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
622. }
/* vpsrad (immediate): arithmetic right shift of 32-bit elements.  */
623. extern __inline __m256i
624. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
625. _mm256_srai_epi32 (__m256i __A, int __B)
626. {
627. return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
628. }
/* vpsrad: arithmetic right shift of 32-bit elements by the count in
   __B.  */
629. extern __inline __m256i
630. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
631. _mm256_sra_epi32 (__m256i __A, __m128i __B)
632. {
633. return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
634. }
635. #ifdef __OPTIMIZE__
/* vpsrldq: byte-wise right shift within each 128-bit lane.  __N is a
   byte count; the builtin takes bits, hence the "* 8".  The inline
   form requires __OPTIMIZE__ so __N folds to an immediate; otherwise
   the macro below is used.  _mm256_srli_si256 is the legacy alias.  */
636. extern __inline __m256i
637. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
638. _mm256_bsrli_epi128 (__m256i __A, const int __N)
639. {
640. return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8)
641. }
642. extern __inline __m256i
643. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
644. _mm256_srli_si256 (__m256i __A, const int __N)
645. {
646. return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
647. }
648. #else
649. #define _mm256_bsrli_epi128(A, N) \
650. ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
651. #define _mm256_srli_si256(A, N) \
652. ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
653. #endif
/* Logical (zero-filling) right shifts; immediate and XMM-count forms
   as above.  */
/* vpsrlw (immediate): logical right shift of 16-bit elements.  */
654. extern __inline __m256i
655. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
656. _mm256_srli_epi16 (__m256i __A, int __B)
657. {
658. return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
659. }
/* vpsrlw: logical right shift of 16-bit elements by the count in __B.  */
660. extern __inline __m256i
661. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
662. _mm256_srl_epi16 (__m256i __A, __m128i __B)
663. {
664. return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
665. }
/* vpsrld (immediate): logical right shift of 32-bit elements.  */
666. extern __inline __m256i
667. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668. _mm256_srli_epi32 (__m256i __A, int __B)
669. {
670. return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
671. }
/* vpsrld: logical right shift of 32-bit elements by the count in __B.  */
672. extern __inline __m256i
673. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
674. _mm256_srl_epi32 (__m256i __A, __m128i __B)
675. {
676. return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
677. }
/* vpsrlq (immediate): logical right shift of 64-bit elements.  */
678. extern __inline __m256i
679. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
680. _mm256_srli_epi64 (__m256i __A, int __B)
681. {
682. return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
683. }
/* vpsrlq: logical right shift of 64-bit elements by the count in __B.  */
684. extern __inline __m256i
685. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
686. _mm256_srl_epi64 (__m256i __A, __m128i __B)
687. {
688. return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
689. }
/* Element-wise subtraction.  The plain forms use generic vector
   arithmetic on unsigned element types so wraparound is well defined
   (no signed-overflow UB); the "subs" forms saturate.  */
/* vpsubb: wrapping 8-bit subtraction.  */
690. extern __inline __m256i
691. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
692. _mm256_sub_epi8 (__m256i __A, __m256i __B)
693. {
694. return (__m256i) ((__v32qu)__A - (__v32qu)__B);
695. }
/* vpsubw: wrapping 16-bit subtraction.  */
696. extern __inline __m256i
697. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
698. _mm256_sub_epi16 (__m256i __A, __m256i __B)
699. {
700. return (__m256i) ((__v16hu)__A - (__v16hu)__B);
701. }
/* vpsubd: wrapping 32-bit subtraction.  */
702. extern __inline __m256i
703. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
704. _mm256_sub_epi32 (__m256i __A, __m256i __B)
705. {
706. return (__m256i) ((__v8su)__A - (__v8su)__B);
707. }
/* vpsubq: wrapping 64-bit subtraction.  */
708. extern __inline __m256i
709. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710. _mm256_sub_epi64 (__m256i __A, __m256i __B)
711. {
712. return (__m256i) ((__v4du)__A - (__v4du)__B);
713. }
/* vpsubsb: signed-saturating 8-bit subtraction.  */
714. extern __inline __m256i
715. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
716. _mm256_subs_epi8 (__m256i __A, __m256i __B)
717. {
718. return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
719. }
/* vpsubsw: signed-saturating 16-bit subtraction.  */
720. extern __inline __m256i
721. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
722. _mm256_subs_epi16 (__m256i __A, __m256i __B)
723. {
724. return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
725. }
/* vpsubusb: unsigned-saturating 8-bit subtraction.  */
726. extern __inline __m256i
727. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
728. _mm256_subs_epu8 (__m256i __A, __m256i __B)
729. {
730. return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
731. }
/* vpsubusw: unsigned-saturating 16-bit subtraction.  */
732. extern __inline __m256i
733. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
734. _mm256_subs_epu16 (__m256i __A, __m256i __B)
735. {
736. return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
737. }
/* Interleave (unpack) operations.  AVX2 punpck* operates on each
   128-bit lane independently; "hi"/"lo" select the upper/lower half
   of each lane.  */
/* vpunpckhbw: interleave high bytes of each lane of __A and __B.  */
738. extern __inline __m256i
739. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
740. _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
741. {
742. return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
743. }
/* vpunpckhwd: interleave high 16-bit words.  */
744. extern __inline __m256i
745. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
746. _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
747. {
748. return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
749. }
/* vpunpckhdq: interleave high 32-bit dwords.  */
750. extern __inline __m256i
751. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
752. _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
753. {
754. return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
755. }
/* vpunpckhqdq: interleave high 64-bit qwords.  */
756. extern __inline __m256i
757. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
758. _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
759. {
760. return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
761. }
/* vpunpcklbw: interleave low bytes.  */
762. extern __inline __m256i
763. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
764. _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
765. {
766. return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
767. }
/* vpunpcklwd: interleave low 16-bit words.  */
768. extern __inline __m256i
769. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
770. _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
771. {
772. return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
773. }
/* vpunpckldq: interleave low 32-bit dwords.  */
774. extern __inline __m256i
775. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
776. _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
777. {
778. return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
779. }
/* vpunpcklqdq: interleave low 64-bit qwords.  */
780. extern __inline __m256i
781. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
782. _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
783. {
784. return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
785. }
/* vpxor: bitwise XOR of the full 256 bits, via generic vector ^.  */
786. extern __inline __m256i
787. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
788. _mm256_xor_si256 (__m256i __A, __m256i __B)
789. {
790. return (__m256i) ((__v4du)__A ^ (__v4du)__B);
791. }
/* vmovntdqa: non-temporal (streaming) 32-byte load.  NOTE(review):
   the cast to (__v4di *) drops the const qualifier, matching the
   builtin's prototype; __X is still only read.  Intel documents the
   source as requiring 32-byte alignment — confirm at call sites.  */
792. extern __inline __m256i
793. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
794. _mm256_stream_load_si256 (__m256i const *__X)
795. {
796. return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
797. }
/* Broadcasts of the lowest element of an XMM source.  */
/* vbroadcastss (128-bit): replicate the low float to all 4 lanes.  */
798. extern __inline __m128
799. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
800. _mm_broadcastss_ps (__m128 __X)
801. {
802. return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
803. }
/* vbroadcastss (256-bit): replicate the low float to all 8 lanes.  */
804. extern __inline __m256
805. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
806. _mm256_broadcastss_ps (__m128 __X)
807. {
808. return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
809. }
/* vbroadcastsd: replicate the low double to all 4 lanes.  */
810. extern __inline __m256d
811. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
812. _mm256_broadcastsd_pd (__m128d __X)
813. {
814. return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
815. }
/* vbroadcasti128: replicate the whole 128-bit source into both lanes.  */
816. extern __inline __m256i
817. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
818. _mm256_broadcastsi128_si256 (__m128i __X)
819. {
820. return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
821. }
822. #ifdef __OPTIMIZE__
/* vpblendd (128-bit): per-dword select — bit i of the immediate __M
   chooses __Y (1) or __X (0) for element i.  Inline form requires
   __OPTIMIZE__ so __M folds to an immediate.  */
823. extern __inline __m128i
824. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
825. _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
826. {
827. return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
828. (__v4si)__Y,
829. __M);
830. }
831. #else
832. #define _mm_blend_epi32(X, Y, M) \
833. ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
834. (__v4si)(__m128i)(Y), (int)(M)))
835. #endif
836. #ifdef __OPTIMIZE__
/* vpblendd (256-bit): as above with an 8-bit element mask.  */
837. extern __inline __m256i
838. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
839. _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
840. {
841. return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
842. (__v8si)__Y,
843. __M);
844. }
845. #else
846. #define _mm256_blend_epi32(X, Y, M) \
847. ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
848. (__v8si)(__m256i)(Y), (int)(M)))
849. #endif
/* Integer broadcasts: replicate the lowest element of __X across the
   destination.  */
/* vpbroadcastb (256-bit).  */
850. extern __inline __m256i
851. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
852. _mm256_broadcastb_epi8 (__m128i __X)
853. {
854. return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
855. }
/* vpbroadcastw (256-bit).  */
856. extern __inline __m256i
857. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
858. _mm256_broadcastw_epi16 (__m128i __X)
859. {
860. return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
861. }
/* vpbroadcastd (256-bit).  */
862. extern __inline __m256i
863. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
864. _mm256_broadcastd_epi32 (__m128i __X)
865. {
866. return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
867. }
/* vpbroadcastq (256-bit).  */
868. extern __inline __m256i
869. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
870. _mm256_broadcastq_epi64 (__m128i __X)
871. {
872. return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
873. }
/* vpbroadcastb (128-bit).  */
874. extern __inline __m128i
875. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
876. _mm_broadcastb_epi8 (__m128i __X)
877. {
878. return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
879. }
/* vpbroadcastw (128-bit).  */
880. extern __inline __m128i
881. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
882. _mm_broadcastw_epi16 (__m128i __X)
883. {
884. return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
885. }
/* vpbroadcastd (128-bit).  */
886. extern __inline __m128i
887. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
888. _mm_broadcastd_epi32 (__m128i __X)
889. {
890. return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
891. }
/* vpbroadcastq (128-bit).  */
892. extern __inline __m128i
893. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
894. _mm_broadcastq_epi64 (__m128i __X)
895. {
896. return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
897. }
/* vpermd: permute the eight 32-bit elements of __X by the variable
   indices in __Y (crosses 128-bit lanes).  */
898. extern __inline __m256i
899. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
900. _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
901. {
902. return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
903. }
904. #ifdef __OPTIMIZE__
/* vpermpd: permute the four doubles of __X by the 2-bit fields of the
   immediate __M (crosses lanes).  Macro fallback when not optimizing,
   since __M must be an immediate.  */
905. extern __inline __m256d
906. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
907. _mm256_permute4x64_pd (__m256d __X, const int __M)
908. {
909. return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
910. }
911. #else
912. #define _mm256_permute4x64_pd(X, M) \
913. ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
914. #endif
/* vpermps: permute the eight floats of __X by variable indices __Y.  */
915. extern __inline __m256
916. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
917. _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
918. {
919. return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
920. }
921. #ifdef __OPTIMIZE__
/* vpermq: permute the four 64-bit elements by the immediate __M.  */
922. extern __inline __m256i
923. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
924. _mm256_permute4x64_epi64 (__m256i __X, const int __M)
925. {
926. return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
927. }
928. #else
929. #define _mm256_permute4x64_epi64(X, M) \
930. ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
931. #endif
932. #ifdef __OPTIMIZE__
/* vperm2i128: select/combine 128-bit halves of __X and __Y per the
   immediate __M.  */
933. extern __inline __m256i
934. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
935. _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
936. {
937. return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
938. }
939. #else
940. #define _mm256_permute2x128_si256(X, Y, M) \
941. ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
942. #endif
943. #ifdef __OPTIMIZE__
/* vextracti128: extract the low (__M==0) or high (__M==1) 128-bit
   half.  */
944. extern __inline __m128i
945. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
946. _mm256_extracti128_si256 (__m256i __X, const int __M)
947. {
948. return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
949. }
950. #else
951. #define _mm256_extracti128_si256(X, M) \
952. ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
953. #endif
954. #ifdef __OPTIMIZE__
/* vinserti128: replace the selected 128-bit half of __X with __Y.  */
955. extern __inline __m256i
956. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
957. _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
958. {
959. return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
960. }
961. #else
962. #define _mm256_inserti128_si256(X, Y, M) \
963. ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
964. (__v2di)(__m128i)(Y), \
965. (int)(M)))
966. #endif
/* Masked loads/stores.  Only elements whose mask element has its sign
   (most significant) bit set are transferred; masked-off load lanes
   read as zero and masked-off store lanes leave memory untouched.  */
/* vpmaskmovd (256-bit load).  */
967. extern __inline __m256i
968. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
969. _mm256_maskload_epi32 (int const *__X, __m256i __M )
970. {
971. return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
972. (__v8si)__M);
973. }
/* vpmaskmovq (256-bit load).  */
974. extern __inline __m256i
975. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
976. _mm256_maskload_epi64 (long long const *__X, __m256i __M )
977. {
978. return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
979. (__v4di)__M);
980. }
/* vpmaskmovd (128-bit load).  */
981. extern __inline __m128i
982. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
983. _mm_maskload_epi32 (int const *__X, __m128i __M )
984. {
985. return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
986. (__v4si)__M);
987. }
/* vpmaskmovq (128-bit load).  */
988. extern __inline __m128i
989. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
990. _mm_maskload_epi64 (long long const *__X, __m128i __M )
991. {
992. return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
993. (__v2di)__M);
994. }
/* vpmaskmovd (256-bit store).  */
995. extern __inline void
996. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
997. _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
998. {
999. __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1000. }
/* vpmaskmovq (256-bit store).  */
1001. extern __inline void
1002. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1003. _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1004. {
1005. __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1006. }
/* vpmaskmovd (128-bit store).  */
1007. extern __inline void
1008. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1009. _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1010. {
1011. __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1012. }
/* vpmaskmovq (128-bit store).  */
1013. extern __inline void
1014. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1015. _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1016. {
1017. __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1018. }
/* Variable (per-element) shifts: each element of __X is shifted by the
   count held in the corresponding element of __Y.  */
/* vpsllvd (256-bit): per-element logical left shift of 32-bit lanes.  */
1019. extern __inline __m256i
1020. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1021. _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1022. {
1023. return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1024. }
/* vpsllvd (128-bit).  */
1025. extern __inline __m128i
1026. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1027. _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1028. {
1029. return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1030. }
/* vpsllvq (256-bit): per-element logical left shift of 64-bit lanes.  */
1031. extern __inline __m256i
1032. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1033. _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1034. {
1035. return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1036. }
/* vpsllvq (128-bit).  */
1037. extern __inline __m128i
1038. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1039. _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1040. {
1041. return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1042. }
/* vpsravd (256-bit): per-element arithmetic right shift of 32-bit
   lanes (no 64-bit variant exists in AVX2).  */
1043. extern __inline __m256i
1044. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1045. _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1046. {
1047. return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1048. }
/* vpsravd (128-bit).  */
1049. extern __inline __m128i
1050. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1051. _mm_srav_epi32 (__m128i __X, __m128i __Y)
1052. {
1053. return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1054. }
/* vpsrlvd (256-bit): per-element logical right shift of 32-bit lanes.  */
1055. extern __inline __m256i
1056. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1057. _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1058. {
1059. return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1060. }
/* vpsrlvd (128-bit).  */
1061. extern __inline __m128i
1062. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1063. _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1064. {
1065. return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1066. }
/* vpsrlvq (256-bit): per-element logical right shift of 64-bit lanes.  */
1067. extern __inline __m256i
1068. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1069. _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1070. {
1071. return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1072. }
/* vpsrlvq (128-bit).  */
1073. extern __inline __m128i
1074. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1075. _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1076. {
1077. return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1078. }
1079. #ifdef __OPTIMIZE__
/* Gathers.  Each loads elements from __base + __index[i] * __scale;
   only elements whose mask element has its sign bit set are loaded,
   the rest are taken from the src operand.  Inline forms require
   __OPTIMIZE__ so __scale folds to an immediate; macro fallbacks
   follow in the #else branch.  The no-mask convenience forms build an
   all-ones mask via cmpeq(x, x) so every element is gathered.  */
/* vgatherdpd (128-bit): 2 doubles via 32-bit indices.  src is
   undefined since the all-ones mask overwrites every lane.  */
1080. extern __inline __m128d
1081. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1082. _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1083. {
1084. __v2df __zero = _mm_setzero_pd ();
1085. __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
1086. return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
1087. __base,
1088. (__v4si)__index,
1089. __mask,
1090. __scale);
1091. }
/* Masked vgatherdpd (128-bit).  */
1092. extern __inline __m128d
1093. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1094. _mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
1095. __m128d __mask, const int __scale)
1096. {
1097. return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
1098. __base,
1099. (__v4si)__index,
1100. (__v2df)__mask,
1101. __scale);
1102. }
/* vgatherdpd (256-bit): 4 doubles via 32-bit indices.  */
1103. extern __inline __m256d
1104. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1105. _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1106. {
1107. __v4df __zero = _mm256_setzero_pd ();
1108. __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
1109. return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
1110. __base,
1111. (__v4si)__index,
1112. __mask,
1113. __scale);
1114. }
/* Masked vgatherdpd (256-bit).  */
1115. extern __inline __m256d
1116. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1117. _mm256_mask_i32gather_pd (__m256d __src, double const *__base,
1118. __m128i __index, __m256d __mask, const int __scale)
1119. {
1120. return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
1121. __base,
1122. (__v4si)__index,
1123. (__v4df)__mask,
1124. __scale);
1125. }
/* vgatherqpd (128-bit): 2 doubles via 64-bit indices.  */
1126. extern __inline __m128d
1127. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1128. _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
1129. {
1130. __v2df __src = _mm_setzero_pd ();
1131. __v2df __mask = _mm_cmpeq_pd (__src, __src);
1132. return (__m128d) __builtin_ia32_gatherdiv2df (__src,
1133. __base,
1134. (__v2di)__index,
1135. __mask,
1136. __scale);
1137. }
/* Masked vgatherqpd (128-bit).  */
1138. extern __inline __m128d
1139. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1140. _mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
1141. __m128d __mask, const int __scale)
1142. {
1143. return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
1144. __base,
1145. (__v2di)__index,
1146. (__v2df)__mask,
1147. __scale);
1148. }
/* vgatherqpd (256-bit): 4 doubles via 64-bit indices.  */
1149. extern __inline __m256d
1150. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1151. _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
1152. {
1153. __v4df __src = _mm256_setzero_pd ();
1154. __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
1155. return (__m256d) __builtin_ia32_gatherdiv4df (__src,
1156. __base,
1157. (__v4di)__index,
1158. __mask,
1159. __scale);
1160. }
/* Masked vgatherqpd (256-bit).  */
1161. extern __inline __m256d
1162. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1163. _mm256_mask_i64gather_pd (__m256d __src, double const *__base,
1164. __m256i __index, __m256d __mask, const int __scale)
1165. {
1166. return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
1167. __base,
1168. (__v4di)__index,
1169. (__v4df)__mask,
1170. __scale);
1171. }
/* Single-precision gathers.  Same masking convention as the _pd
   forms.  */
/* vgatherdps (128-bit): 4 floats via 32-bit indices.  */
1172. extern __inline __m128
1173. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1174. _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
1175. {
1176. __v4sf __src = _mm_setzero_ps ();
1177. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1178. return (__m128) __builtin_ia32_gathersiv4sf (__src,
1179. __base,
1180. (__v4si)__index,
1181. __mask,
1182. __scale);
1183. }
/* Masked vgatherdps (128-bit).  */
1184. extern __inline __m128
1185. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1186. _mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
1187. __m128 __mask, const int __scale)
1188. {
1189. return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
1190. __base,
1191. (__v4si)__index,
1192. (__v4sf)__mask,
1193. __scale);
1194. }
/* vgatherdps (256-bit): 8 floats via 32-bit indices.  */
1195. extern __inline __m256
1196. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1197. _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
1198. {
1199. __v8sf __src = _mm256_setzero_ps ();
1200. __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
1201. return (__m256) __builtin_ia32_gathersiv8sf (__src,
1202. __base,
1203. (__v8si)__index,
1204. __mask,
1205. __scale);
1206. }
/* Masked vgatherdps (256-bit).  */
1207. extern __inline __m256
1208. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1209. _mm256_mask_i32gather_ps (__m256 __src, float const *__base,
1210. __m256i __index, __m256 __mask, const int __scale)
1211. {
1212. return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
1213. __base,
1214. (__v8si)__index,
1215. (__v8sf)__mask,
1216. __scale);
1217. }
/* vgatherqps (128-bit): 4 floats via 64-bit indices (only 2 used).  */
1218. extern __inline __m128
1219. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1220. _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
1221. {
1222. __v4sf __src = _mm_setzero_ps ();
1223. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1224. return (__m128) __builtin_ia32_gatherdiv4sf (__src,
1225. __base,
1226. (__v2di)__index,
1227. __mask,
1228. __scale);
1229. }
/* Masked vgatherqps (128-bit).  */
1230. extern __inline __m128
1231. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1232. _mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
1233. __m128 __mask, const int __scale)
1234. {
1235. return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
1236. __base,
1237. (__v2di)__index,
1238. (__v4sf)__mask,
1239. __scale);
1240. }
/* vgatherqps (256-bit indices): 4 floats via four 64-bit indices;
   note the result is only an XMM register.  */
1241. extern __inline __m128
1242. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1243. _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
1244. {
1245. __v4sf __src = _mm_setzero_ps ();
1246. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1247. return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
1248. __base,
1249. (__v4di)__index,
1250. __mask,
1251. __scale);
1252. }
/* Masked vgatherqps (256-bit indices).  */
1253. extern __inline __m128
1254. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1255. _mm256_mask_i64gather_ps (__m128 __src, float const *__base,
1256. __m256i __index, __m128 __mask, const int __scale)
1257. {
1258. return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
1259. __base,
1260. (__v4di)__index,
1261. (__v4sf)__mask,
1262. __scale);
1263. }
/* 64-bit integer gathers.  The no-mask forms build zero src and
   all-ones mask vectors directly as vector literals.  */
/* vpgatherdq (128-bit): 2 qwords via 32-bit indices.  */
1264. extern __inline __m128i
1265. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1266. _mm_i32gather_epi64 (long long int const *__base,
1267. __m128i __index, const int __scale)
1268. {
1269. __v2di __src = __extension__ (__v2di){ 0, 0 };
1270. __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1271. return (__m128i) __builtin_ia32_gathersiv2di (__src,
1272. __base,
1273. (__v4si)__index,
1274. __mask,
1275. __scale);
1276. }
/* Masked vpgatherdq (128-bit).  */
1277. extern __inline __m128i
1278. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1279. _mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
1280. __m128i __index, __m128i __mask, const int __scale)
1281. {
1282. return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
1283. __base,
1284. (__v4si)__index,
1285. (__v2di)__mask,
1286. __scale);
1287. }
/* vpgatherdq (256-bit): 4 qwords via 32-bit indices.  */
1288. extern __inline __m256i
1289. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1290. _mm256_i32gather_epi64 (long long int const *__base,
1291. __m128i __index, const int __scale)
1292. {
1293. __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1294. __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1295. return (__m256i) __builtin_ia32_gathersiv4di (__src,
1296. __base,
1297. (__v4si)__index,
1298. __mask,
1299. __scale);
1300. }
/* Masked vpgatherdq (256-bit).  */
1301. extern __inline __m256i
1302. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1303. _mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
1304. __m128i __index, __m256i __mask,
1305. const int __scale)
1306. {
1307. return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
1308. __base,
1309. (__v4si)__index,
1310. (__v4di)__mask,
1311. __scale);
1312. }
/* vpgatherqq (128-bit): 2 qwords via 64-bit indices.  */
1313. extern __inline __m128i
1314. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1315. _mm_i64gather_epi64 (long long int const *__base,
1316. __m128i __index, const int __scale)
1317. {
1318. __v2di __src = __extension__ (__v2di){ 0, 0 };
1319. __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1320. return (__m128i) __builtin_ia32_gatherdiv2di (__src,
1321. __base,
1322. (__v2di)__index,
1323. __mask,
1324. __scale);
1325. }
/* Masked vpgatherqq (128-bit).  */
1326. extern __inline __m128i
1327. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1328. _mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
1329. __m128i __index, __m128i __mask, const int __scale)
1330. {
1331. return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
1332. __base,
1333. (__v2di)__index,
1334. (__v2di)__mask,
1335. __scale);
1336. }
/* vpgatherqq (256-bit): 4 qwords via 64-bit indices.  */
1337. extern __inline __m256i
1338. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1339. _mm256_i64gather_epi64 (long long int const *__base,
1340. __m256i __index, const int __scale)
1341. {
1342. __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1343. __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1344. return (__m256i) __builtin_ia32_gatherdiv4di (__src,
1345. __base,
1346. (__v4di)__index,
1347. __mask,
1348. __scale);
1349. }
/* Masked vpgatherqq (256-bit).  */
1350. extern __inline __m256i
1351. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1352. _mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
1353. __m256i __index, __m256i __mask,
1354. const int __scale)
1355. {
1356. return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
1357. __base,
1358. (__v4di)__index,
1359. (__v4di)__mask,
1360. __scale);
1361. }
  1362. extern __inline __m128i
  1363. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1364. _mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
  1365. {
  1366. __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  1367. __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  1368. return (__m128i) __builtin_ia32_gathersiv4si (__src,
  1369. __base,
  1370. (__v4si)__index,
  1371. __mask,
  1372. __scale);
  1373. }
  1374. extern __inline __m128i
  1375. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1376. _mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
  1377. __m128i __mask, const int __scale)
  1378. {
  1379. return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
  1380. __base,
  1381. (__v4si)__index,
  1382. (__v4si)__mask,
  1383. __scale);
  1384. }
  1385. extern __inline __m256i
  1386. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1387. _mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
  1388. {
  1389. __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  1390. __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
  1391. return (__m256i) __builtin_ia32_gathersiv8si (__src,
  1392. __base,
  1393. (__v8si)__index,
  1394. __mask,
  1395. __scale);
  1396. }
  1397. extern __inline __m256i
  1398. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1399. _mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
  1400. __m256i __index, __m256i __mask,
  1401. const int __scale)
  1402. {
  1403. return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
  1404. __base,
  1405. (__v8si)__index,
  1406. (__v8si)__mask,
  1407. __scale);
  1408. }
  1409. extern __inline __m128i
  1410. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1411. _mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
  1412. {
  1413. __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  1414. __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  1415. return (__m128i) __builtin_ia32_gatherdiv4si (__src,
  1416. __base,
  1417. (__v2di)__index,
  1418. __mask,
  1419. __scale);
  1420. }
  1421. extern __inline __m128i
  1422. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1423. _mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
  1424. __m128i __mask, const int __scale)
  1425. {
  1426. return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
  1427. __base,
  1428. (__v2di)__index,
  1429. (__v4si)__mask,
  1430. __scale);
  1431. }
  1432. extern __inline __m128i
  1433. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1434. _mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
  1435. {
  1436. __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  1437. __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  1438. return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
  1439. __base,
  1440. (__v4di)__index,
  1441. __mask,
  1442. __scale);
  1443. }
/* Masked gather of four 32-bit ints through the four 64-bit indices
   in __index (vpgatherqd, 256-bit index register).  Result elements
   whose __mask element has its sign bit clear are taken from __src
   instead of memory.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
			     __m256i __index, __m128i __mask,
			     const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
						   __base,
						   (__v4di)__index,
						   (__v4si)__mask,
						   __scale);
}
  1456. #else /* __OPTIMIZE__ */
/* Non-__OPTIMIZE__ forms: SCALE must reach the builtin as a literal
   constant, so each intrinsic is a macro expanding directly to its
   gather builtin.  Unmasked variants pass a zeroed source and an
   all-ones mask (every element is gathered); _mask_ variants forward
   SRC and MASK unchanged.  For the double-precision masks,
   (double)-1 sets the sign bit of each 64-bit element, which is the
   only bit the gather hardware inspects.  */

/* Gather doubles via 32-bit indices (vgatherdpd), 128-bit result.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)_mm_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK, \
					 (int)SCALE)

/* Gather doubles via 32-bit indices (vgatherdpd), 256-bit result.  */
#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)_mm256_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK, \
					 (int)SCALE)

/* Gather doubles via 64-bit indices (vgatherqpd), 128-bit result.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)_mm_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
					 (double const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK, \
					 (int)SCALE)

/* Gather doubles via 64-bit indices (vgatherqpd), 256-bit result.  */
#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)_mm256_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
					 (double const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK, \
					 (int)SCALE)
/* Single-precision gathers via 32-bit indices.  The all-ones mask is
   built as (float)-1: its sign bit is set in every 32-bit element,
   which is all the gather hardware checks.  */

/* Gather floats via 32-bit indices (vgatherdps), 128-bit result.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					_mm_set1_ps ((float)(int) -1), \
					(int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC, \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					(__v4sf)(__m128)MASK, \
					(int)SCALE)

/* Gather floats via 32-bit indices (vgatherdps), 256-bit result.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8sf)_mm256_set1_ps ( \
					  (float)(int) -1), \
					(int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256)MASK, \
					(int)SCALE)
  1534. #define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  1535. (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \
  1536. (float const *)BASE, \
  1537. (__v2di)(__m128i)INDEX, \
  1538. (__v4sf)_mm_set1_ps ( \
  1539. (float)(int) -1), \
  1540. (int)SCALE)
/* Single-precision gathers via 64-bit indices.  With twice as many
   index bits as data bits, only the low half of the result register
   is written; the instruction zeroes the rest.  */

/* Masked gather of floats via two 64-bit indices (vgatherqps).  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
					(float const *)BASE, \
					(__v2di)(__m128i)INDEX, \
					(__v4sf)(__m128)MASK, \
					(int)SCALE)

/* Gather four floats via four 64-bit indices; 128-bit result.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)_mm_set1_ps( \
					     (float)(int) -1), \
					   (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK, \
					   (int)SCALE)
/* 64-bit integer gathers.  The unmasked variants use
   _mm*_set1_epi64x (-1) as the all-ones mask: sign bit set in every
   64-bit element, so every element is loaded.  */

/* Gather 64-bit ints via 32-bit indices (vpgatherdq), 128-bit.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2di)(__m128i)MASK, \
					 (int)SCALE)

/* Gather 64-bit ints via 32-bit indices (vpgatherdq), 256-bit.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4di)(__m256i)MASK, \
					 (int)SCALE)

/* Gather 64-bit ints via 64-bit indices (vpgatherqq), 128-bit.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2di)(__m128i)MASK, \
					 (int)SCALE)

/* Gather 64-bit ints via 64-bit indices (vpgatherqq), 256-bit.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
					 (long long const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4di)(__m256i)MASK, \
					 (int)SCALE)
/* 32-bit integer gathers.  The unmasked variants use
   _mm*_set1_epi32 (-1) as the all-ones mask (sign bit set in every
   element).  The 64-bit-indexed forms write only the low half of the
   result register; the instruction zeroes the rest.  */

/* Gather 32-bit ints via 32-bit indices (vpgatherdd), 128-bit.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
					 (int const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4si)(__m128i)MASK, \
					 (int)SCALE)

/* Gather 32-bit ints via 32-bit indices (vpgatherdd), 256-bit.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE, \
					 (__v8si)(__m256i)INDEX, \
					 (__v8si)_mm256_set1_epi32 (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
					 (int const *)BASE, \
					 (__v8si)(__m256i)INDEX, \
					 (__v8si)(__m256i)MASK, \
					 (int)SCALE)

/* Gather 32-bit ints via two 64-bit indices (vpgatherqd).  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int)SCALE)

/* Masked form of the above.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
					 (int const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v4si)(__m128i)MASK, \
					 (int)SCALE)

/* Gather 32-bit ints via four 64-bit indices (vpgatherqd).  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE, \
					    (__v4di)(__m256i)INDEX, \
					    (__v4si)_mm_set1_epi32(-1), \
					    (int)SCALE)

/* Masked form of the above.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
					    (int const *)BASE, \
					    (__v4di)(__m256i)INDEX, \
					    (__v4si)(__m128i)MASK, \
					    (int)SCALE)
  1656. #endif /* __OPTIMIZE__ */
  1657. #ifdef __DISABLE_AVX2__
  1658. #undef __DISABLE_AVX2__
  1659. #pragma GCC pop_options
  1660. #endif /* __DISABLE_AVX2__ */
  1661. #endif /* _AVX2INTRIN_H_INCLUDED */