/* Copyright (C) 2011-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
  18. #ifndef _IMMINTRIN_H_INCLUDED
  19. # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  20. #endif
  21. #ifndef _AVX2INTRIN_H_INCLUDED
  22. #define _AVX2INTRIN_H_INCLUDED
  23. #ifndef __AVX2__
  24. #pragma GCC push_options
  25. #pragma GCC target("avx2")
  26. #define __DISABLE_AVX2__
  27. #endif /* __AVX2__ */
  28. /* Sum absolute 8-bit integer difference of adjacent groups of 4
  29. byte integers in the first 2 operands. Starting offsets within
  30. operands are determined by the 3rd mask operand. */
  31. #ifdef __OPTIMIZE__
  32. extern __inline __m256i
  33. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  34. _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
  35. {
  36. return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
  37. (__v32qi)__Y, __M);
  38. }
  39. #else
  40. #define _mm256_mpsadbw_epu8(X, Y, M) \
  41. ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
  42. (__v32qi)(__m256i)(Y), (int)(M)))
  43. #endif
  44. extern __inline __m256i
  45. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  46. _mm256_abs_epi8 (__m256i __A)
  47. {
  48. return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
  49. }
  50. extern __inline __m256i
  51. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  52. _mm256_abs_epi16 (__m256i __A)
  53. {
  54. return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
  55. }
  56. extern __inline __m256i
  57. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  58. _mm256_abs_epi32 (__m256i __A)
  59. {
  60. return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
  61. }
  62. extern __inline __m256i
  63. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  64. _mm256_packs_epi32 (__m256i __A, __m256i __B)
  65. {
  66. return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
  67. }
  68. extern __inline __m256i
  69. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  70. _mm256_packs_epi16 (__m256i __A, __m256i __B)
  71. {
  72. return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
  73. }
  74. extern __inline __m256i
  75. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  76. _mm256_packus_epi32 (__m256i __A, __m256i __B)
  77. {
  78. return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
  79. }
  80. extern __inline __m256i
  81. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  82. _mm256_packus_epi16 (__m256i __A, __m256i __B)
  83. {
  84. return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
  85. }
  86. extern __inline __m256i
  87. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  88. _mm256_add_epi8 (__m256i __A, __m256i __B)
  89. {
  90. return (__m256i) ((__v32qu)__A + (__v32qu)__B);
  91. }
  92. extern __inline __m256i
  93. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  94. _mm256_add_epi16 (__m256i __A, __m256i __B)
  95. {
  96. return (__m256i) ((__v16hu)__A + (__v16hu)__B);
  97. }
  98. extern __inline __m256i
  99. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  100. _mm256_add_epi32 (__m256i __A, __m256i __B)
  101. {
  102. return (__m256i) ((__v8su)__A + (__v8su)__B);
  103. }
  104. extern __inline __m256i
  105. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  106. _mm256_add_epi64 (__m256i __A, __m256i __B)
  107. {
  108. return (__m256i) ((__v4du)__A + (__v4du)__B);
  109. }
  110. extern __inline __m256i
  111. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  112. _mm256_adds_epi8 (__m256i __A, __m256i __B)
  113. {
  114. return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
  115. }
  116. extern __inline __m256i
  117. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  118. _mm256_adds_epi16 (__m256i __A, __m256i __B)
  119. {
  120. return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
  121. }
  122. extern __inline __m256i
  123. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  124. _mm256_adds_epu8 (__m256i __A, __m256i __B)
  125. {
  126. return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
  127. }
  128. extern __inline __m256i
  129. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  130. _mm256_adds_epu16 (__m256i __A, __m256i __B)
  131. {
  132. return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
  133. }
  134. #ifdef __OPTIMIZE__
  135. extern __inline __m256i
  136. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  137. _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
  138. {
  139. return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
  140. (__v4di)__B,
  141. __N * 8);
  142. }
  143. #else
  144. /* In that case (__N*8) will be in vreg, and insn will not be matched. */
  145. /* Use define instead */
  146. #define _mm256_alignr_epi8(A, B, N) \
  147. ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
  148. (__v4di)(__m256i)(B), \
  149. (int)(N) * 8))
  150. #endif
  151. extern __inline __m256i
  152. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  153. _mm256_and_si256 (__m256i __A, __m256i __B)
  154. {
  155. return (__m256i) ((__v4du)__A & (__v4du)__B);
  156. }
  157. extern __inline __m256i
  158. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  159. _mm256_andnot_si256 (__m256i __A, __m256i __B)
  160. {
  161. return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
  162. }
  163. extern __inline __m256i
  164. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  165. _mm256_avg_epu8 (__m256i __A, __m256i __B)
  166. {
  167. return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
  168. }
  169. extern __inline __m256i
  170. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  171. _mm256_avg_epu16 (__m256i __A, __m256i __B)
  172. {
  173. return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
  174. }
  175. extern __inline __m256i
  176. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  177. _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
  178. {
  179. return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
  180. (__v32qi)__Y,
  181. (__v32qi)__M);
  182. }
  183. #ifdef __OPTIMIZE__
  184. extern __inline __m256i
  185. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  186. _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
  187. {
  188. return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
  189. (__v16hi)__Y,
  190. __M);
  191. }
  192. #else
  193. #define _mm256_blend_epi16(X, Y, M) \
  194. ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
  195. (__v16hi)(__m256i)(Y), (int)(M)))
  196. #endif
  197. extern __inline __m256i
  198. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  199. _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
  200. {
  201. return (__m256i) ((__v32qi)__A == (__v32qi)__B);
  202. }
  203. extern __inline __m256i
  204. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  205. _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
  206. {
  207. return (__m256i) ((__v16hi)__A == (__v16hi)__B);
  208. }
  209. extern __inline __m256i
  210. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  211. _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
  212. {
  213. return (__m256i) ((__v8si)__A == (__v8si)__B);
  214. }
  215. extern __inline __m256i
  216. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  217. _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
  218. {
  219. return (__m256i) ((__v4di)__A == (__v4di)__B);
  220. }
  221. extern __inline __m256i
  222. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  223. _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
  224. {
  225. return (__m256i) ((__v32qs)__A > (__v32qs)__B);
  226. }
  227. extern __inline __m256i
  228. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  229. _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
  230. {
  231. return (__m256i) ((__v16hi)__A > (__v16hi)__B);
  232. }
  233. extern __inline __m256i
  234. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  235. _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
  236. {
  237. return (__m256i) ((__v8si)__A > (__v8si)__B);
  238. }
  239. extern __inline __m256i
  240. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  241. _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
  242. {
  243. return (__m256i) ((__v4di)__A > (__v4di)__B);
  244. }
  245. extern __inline __m256i
  246. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  247. _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
  248. {
  249. return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
  250. (__v16hi)__Y);
  251. }
  252. extern __inline __m256i
  253. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  254. _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
  255. {
  256. return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
  257. }
  258. extern __inline __m256i
  259. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  260. _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
  261. {
  262. return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
  263. (__v16hi)__Y);
  264. }
  265. extern __inline __m256i
  266. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  267. _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
  268. {
  269. return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
  270. (__v16hi)__Y);
  271. }
  272. extern __inline __m256i
  273. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  274. _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
  275. {
  276. return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
  277. }
  278. extern __inline __m256i
  279. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  280. _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
  281. {
  282. return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
  283. (__v16hi)__Y);
  284. }
  285. extern __inline __m256i
  286. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  287. _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
  288. {
  289. return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
  290. (__v32qi)__Y);
  291. }
  292. extern __inline __m256i
  293. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  294. _mm256_madd_epi16 (__m256i __A, __m256i __B)
  295. {
  296. return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
  297. (__v16hi)__B);
  298. }
  299. extern __inline __m256i
  300. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  301. _mm256_max_epi8 (__m256i __A, __m256i __B)
  302. {
  303. return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
  304. }
  305. extern __inline __m256i
  306. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  307. _mm256_max_epi16 (__m256i __A, __m256i __B)
  308. {
  309. return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
  310. }
  311. extern __inline __m256i
  312. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  313. _mm256_max_epi32 (__m256i __A, __m256i __B)
  314. {
  315. return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
  316. }
  317. extern __inline __m256i
  318. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  319. _mm256_max_epu8 (__m256i __A, __m256i __B)
  320. {
  321. return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
  322. }
  323. extern __inline __m256i
  324. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  325. _mm256_max_epu16 (__m256i __A, __m256i __B)
  326. {
  327. return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
  328. }
  329. extern __inline __m256i
  330. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  331. _mm256_max_epu32 (__m256i __A, __m256i __B)
  332. {
  333. return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
  334. }
  335. extern __inline __m256i
  336. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  337. _mm256_min_epi8 (__m256i __A, __m256i __B)
  338. {
  339. return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
  340. }
  341. extern __inline __m256i
  342. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  343. _mm256_min_epi16 (__m256i __A, __m256i __B)
  344. {
  345. return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
  346. }
  347. extern __inline __m256i
  348. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  349. _mm256_min_epi32 (__m256i __A, __m256i __B)
  350. {
  351. return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
  352. }
  353. extern __inline __m256i
  354. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  355. _mm256_min_epu8 (__m256i __A, __m256i __B)
  356. {
  357. return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
  358. }
  359. extern __inline __m256i
  360. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  361. _mm256_min_epu16 (__m256i __A, __m256i __B)
  362. {
  363. return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
  364. }
  365. extern __inline __m256i
  366. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  367. _mm256_min_epu32 (__m256i __A, __m256i __B)
  368. {
  369. return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
  370. }
  371. extern __inline int
  372. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  373. _mm256_movemask_epi8 (__m256i __A)
  374. {
  375. return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
  376. }
  377. extern __inline __m256i
  378. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  379. _mm256_cvtepi8_epi16 (__m128i __X)
  380. {
  381. return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
  382. }
  383. extern __inline __m256i
  384. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  385. _mm256_cvtepi8_epi32 (__m128i __X)
  386. {
  387. return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
  388. }
  389. extern __inline __m256i
  390. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  391. _mm256_cvtepi8_epi64 (__m128i __X)
  392. {
  393. return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
  394. }
  395. extern __inline __m256i
  396. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  397. _mm256_cvtepi16_epi32 (__m128i __X)
  398. {
  399. return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
  400. }
  401. extern __inline __m256i
  402. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  403. _mm256_cvtepi16_epi64 (__m128i __X)
  404. {
  405. return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
  406. }
  407. extern __inline __m256i
  408. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  409. _mm256_cvtepi32_epi64 (__m128i __X)
  410. {
  411. return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
  412. }
  413. extern __inline __m256i
  414. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  415. _mm256_cvtepu8_epi16 (__m128i __X)
  416. {
  417. return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
  418. }
  419. extern __inline __m256i
  420. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  421. _mm256_cvtepu8_epi32 (__m128i __X)
  422. {
  423. return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
  424. }
  425. extern __inline __m256i
  426. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  427. _mm256_cvtepu8_epi64 (__m128i __X)
  428. {
  429. return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
  430. }
  431. extern __inline __m256i
  432. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  433. _mm256_cvtepu16_epi32 (__m128i __X)
  434. {
  435. return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
  436. }
  437. extern __inline __m256i
  438. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  439. _mm256_cvtepu16_epi64 (__m128i __X)
  440. {
  441. return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
  442. }
  443. extern __inline __m256i
  444. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  445. _mm256_cvtepu32_epi64 (__m128i __X)
  446. {
  447. return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
  448. }
  449. extern __inline __m256i
  450. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  451. _mm256_mul_epi32 (__m256i __X, __m256i __Y)
  452. {
  453. return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
  454. }
  455. extern __inline __m256i
  456. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  457. _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
  458. {
  459. return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
  460. (__v16hi)__Y);
  461. }
  462. extern __inline __m256i
  463. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  464. _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
  465. {
  466. return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
  467. }
  468. extern __inline __m256i
  469. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  470. _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
  471. {
  472. return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
  473. }
  474. extern __inline __m256i
  475. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  476. _mm256_mullo_epi16 (__m256i __A, __m256i __B)
  477. {
  478. return (__m256i) ((__v16hu)__A * (__v16hu)__B);
  479. }
  480. extern __inline __m256i
  481. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  482. _mm256_mullo_epi32 (__m256i __A, __m256i __B)
  483. {
  484. return (__m256i) ((__v8su)__A * (__v8su)__B);
  485. }
  486. extern __inline __m256i
  487. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  488. _mm256_mul_epu32 (__m256i __A, __m256i __B)
  489. {
  490. return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
  491. }
  492. extern __inline __m256i
  493. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  494. _mm256_or_si256 (__m256i __A, __m256i __B)
  495. {
  496. return (__m256i) ((__v4du)__A | (__v4du)__B);
  497. }
  498. extern __inline __m256i
  499. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  500. _mm256_sad_epu8 (__m256i __A, __m256i __B)
  501. {
  502. return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
  503. }
  504. extern __inline __m256i
  505. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  506. _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
  507. {
  508. return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
  509. (__v32qi)__Y);
  510. }
  511. #ifdef __OPTIMIZE__
  512. extern __inline __m256i
  513. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  514. _mm256_shuffle_epi32 (__m256i __A, const int __mask)
  515. {
  516. return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
  517. }
  518. extern __inline __m256i
  519. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  520. _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
  521. {
  522. return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
  523. }
  524. extern __inline __m256i
  525. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  526. _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
  527. {
  528. return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
  529. }
  530. #else
  531. #define _mm256_shuffle_epi32(A, N) \
  532. ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
  533. #define _mm256_shufflehi_epi16(A, N) \
  534. ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
  535. #define _mm256_shufflelo_epi16(A, N) \
  536. ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
  537. #endif
  538. extern __inline __m256i
  539. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  540. _mm256_sign_epi8 (__m256i __X, __m256i __Y)
  541. {
  542. return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
  543. }
  544. extern __inline __m256i
  545. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  546. _mm256_sign_epi16 (__m256i __X, __m256i __Y)
  547. {
  548. return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
  549. }
  550. extern __inline __m256i
  551. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  552. _mm256_sign_epi32 (__m256i __X, __m256i __Y)
  553. {
  554. return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
  555. }
  556. #ifdef __OPTIMIZE__
  557. extern __inline __m256i
  558. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  559. _mm256_bslli_epi128 (__m256i __A, const int __N)
  560. {
  561. return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
  562. }
  563. extern __inline __m256i
  564. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  565. _mm256_slli_si256 (__m256i __A, const int __N)
  566. {
  567. return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
  568. }
  569. #else
  570. #define _mm256_bslli_epi128(A, N) \
  571. ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
  572. #define _mm256_slli_si256(A, N) \
  573. ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
  574. #endif
  575. extern __inline __m256i
  576. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  577. _mm256_slli_epi16 (__m256i __A, int __B)
  578. {
  579. return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
  580. }
/* 256-bit element shifts.  The "slli"/"srai" forms take the shift count
   as an int; the "sll"/"sra" forms take it from the low 64 bits of an
   __m128i.  Per the ISA, logical shifts by counts >= the element width
   yield 0, and arithmetic right shifts fill with the sign bit.  */

/* Shift each 16-bit element of __A left by the count in __B (vpsllw).  */
581. extern __inline __m256i
582. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
583. _mm256_sll_epi16 (__m256i __A, __m128i __B)
584. {
585. return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
586. }
/* Shift each 32-bit element of __A left by __B bits (vpslld imm form).  */
587. extern __inline __m256i
588. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
589. _mm256_slli_epi32 (__m256i __A, int __B)
590. {
591. return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
592. }
/* Shift each 32-bit element of __A left by the count in __B (vpslld).  */
593. extern __inline __m256i
594. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
595. _mm256_sll_epi32 (__m256i __A, __m128i __B)
596. {
597. return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
598. }
/* Shift each 64-bit element of __A left by __B bits (vpsllq imm form).  */
599. extern __inline __m256i
600. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
601. _mm256_slli_epi64 (__m256i __A, int __B)
602. {
603. return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
604. }
/* Shift each 64-bit element of __A left by the count in __B (vpsllq).  */
605. extern __inline __m256i
606. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
607. _mm256_sll_epi64 (__m256i __A, __m128i __B)
608. {
609. return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
610. }
/* Arithmetic right shift of each 16-bit element by __B bits (vpsraw).  */
611. extern __inline __m256i
612. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
613. _mm256_srai_epi16 (__m256i __A, int __B)
614. {
615. return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
616. }
/* Arithmetic right shift of each 16-bit element by the count in __B.  */
617. extern __inline __m256i
618. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
619. _mm256_sra_epi16 (__m256i __A, __m128i __B)
620. {
621. return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
622. }
/* Arithmetic right shift of each 32-bit element by __B bits (vpsrad).  */
623. extern __inline __m256i
624. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
625. _mm256_srai_epi32 (__m256i __A, int __B)
626. {
627. return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
628. }
/* Arithmetic right shift of each 32-bit element by the count in __B.  */
629. extern __inline __m256i
630. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
631. _mm256_sra_epi32 (__m256i __A, __m128i __B)
632. {
633. return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
634. }
/* Byte-wise right shift of each 128-bit lane (vpsrldq).  __N is a BYTE
   count; the builtin takes a bit count, hence the "* 8".  The inline
   functions require constant propagation (hence __OPTIMIZE__); without
   optimization the macro forms keep __N a compile-time constant.
   _mm256_srli_si256 is the legacy alias of _mm256_bsrli_epi128.  */
635. #ifdef __OPTIMIZE__
636. extern __inline __m256i
637. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
638. _mm256_bsrli_epi128 (__m256i __A, const int __N)
639. {
640. return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
641. }
642. extern __inline __m256i
643. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
644. _mm256_srli_si256 (__m256i __A, const int __N)
645. {
646. return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
647. }
648. #else
649. #define _mm256_bsrli_epi128(A, N) \
650. ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
651. #define _mm256_srli_si256(A, N) \
652. ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
653. #endif
/* Logical (zero-filling) right shifts.  "srli" forms take an int count,
   "srl" forms take the count from the low 64 bits of __B.  */

/* Shift each 16-bit element right by __B bits (vpsrlw imm form).  */
654. extern __inline __m256i
655. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
656. _mm256_srli_epi16 (__m256i __A, int __B)
657. {
658. return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
659. }
/* Shift each 16-bit element right by the count in __B (vpsrlw).  */
660. extern __inline __m256i
661. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
662. _mm256_srl_epi16 (__m256i __A, __m128i __B)
663. {
664. return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
665. }
/* Shift each 32-bit element right by __B bits (vpsrld imm form).  */
666. extern __inline __m256i
667. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668. _mm256_srli_epi32 (__m256i __A, int __B)
669. {
670. return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
671. }
/* Shift each 32-bit element right by the count in __B (vpsrld).  */
672. extern __inline __m256i
673. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
674. _mm256_srl_epi32 (__m256i __A, __m128i __B)
675. {
676. return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
677. }
/* Shift each 64-bit element right by __B bits (vpsrlq imm form).  */
678. extern __inline __m256i
679. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
680. _mm256_srli_epi64 (__m256i __A, int __B)
681. {
682. return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
683. }
/* Shift each 64-bit element right by the count in __B (vpsrlq).  */
684. extern __inline __m256i
685. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
686. _mm256_srl_epi64 (__m256i __A, __m128i __B)
687. {
688. return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
689. }
/* Element-wise subtraction.  The plain "sub" forms wrap modulo 2^width
   (done via the unsigned vector types so overflow is well defined in C);
   the "subs" forms saturate: signed to [INT_MIN, INT_MAX] of the element
   type, unsigned to [0, UINT_MAX] of the element type.  */

/* __A - __B on 8-bit elements, wrapping (vpsubb).  */
690. extern __inline __m256i
691. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
692. _mm256_sub_epi8 (__m256i __A, __m256i __B)
693. {
694. return (__m256i) ((__v32qu)__A - (__v32qu)__B);
695. }
/* __A - __B on 16-bit elements, wrapping (vpsubw).  */
696. extern __inline __m256i
697. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
698. _mm256_sub_epi16 (__m256i __A, __m256i __B)
699. {
700. return (__m256i) ((__v16hu)__A - (__v16hu)__B);
701. }
/* __A - __B on 32-bit elements, wrapping (vpsubd).  */
702. extern __inline __m256i
703. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
704. _mm256_sub_epi32 (__m256i __A, __m256i __B)
705. {
706. return (__m256i) ((__v8su)__A - (__v8su)__B);
707. }
/* __A - __B on 64-bit elements, wrapping (vpsubq).  */
708. extern __inline __m256i
709. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710. _mm256_sub_epi64 (__m256i __A, __m256i __B)
711. {
712. return (__m256i) ((__v4du)__A - (__v4du)__B);
713. }
/* Signed saturating __A - __B on 8-bit elements (vpsubsb).  */
714. extern __inline __m256i
715. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
716. _mm256_subs_epi8 (__m256i __A, __m256i __B)
717. {
718. return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
719. }
/* Signed saturating __A - __B on 16-bit elements (vpsubsw).  */
720. extern __inline __m256i
721. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
722. _mm256_subs_epi16 (__m256i __A, __m256i __B)
723. {
724. return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
725. }
/* Unsigned saturating __A - __B on 8-bit elements (vpsubusb).  */
726. extern __inline __m256i
727. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
728. _mm256_subs_epu8 (__m256i __A, __m256i __B)
729. {
730. return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
731. }
/* Unsigned saturating __A - __B on 16-bit elements (vpsubusw).  */
732. extern __inline __m256i
733. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
734. _mm256_subs_epu16 (__m256i __A, __m256i __B)
735. {
736. return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
737. }
/* Interleave elements from __A and __B.  Note these are the AVX2
   in-lane forms: each 128-bit lane is unpacked independently, i.e. the
   high (unpackhi) or low (unpacklo) half of each lane of __A is
   interleaved with the corresponding half of the same lane of __B.  */

/* Interleave high 8-bit elements per lane (vpunpckhbw).  */
738. extern __inline __m256i
739. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
740. _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
741. {
742. return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
743. }
/* Interleave high 16-bit elements per lane (vpunpckhwd).  */
744. extern __inline __m256i
745. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
746. _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
747. {
748. return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
749. }
/* Interleave high 32-bit elements per lane (vpunpckhdq).  */
750. extern __inline __m256i
751. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
752. _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
753. {
754. return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
755. }
/* Interleave high 64-bit elements per lane (vpunpckhqdq).  */
756. extern __inline __m256i
757. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
758. _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
759. {
760. return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
761. }
/* Interleave low 8-bit elements per lane (vpunpcklbw).  */
762. extern __inline __m256i
763. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
764. _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
765. {
766. return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
767. }
/* Interleave low 16-bit elements per lane (vpunpcklwd).  */
768. extern __inline __m256i
769. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
770. _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
771. {
772. return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
773. }
/* Interleave low 32-bit elements per lane (vpunpckldq).  */
774. extern __inline __m256i
775. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
776. _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
777. {
778. return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
779. }
/* Interleave low 64-bit elements per lane (vpunpcklqdq).  */
780. extern __inline __m256i
781. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
782. _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
783. {
784. return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
785. }
/* Bitwise XOR of the full 256-bit values (vpxor); the element type used
   for the cast is irrelevant to a bitwise operation.  */
786. extern __inline __m256i
787. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
788. _mm256_xor_si256 (__m256i __A, __m256i __B)
789. {
790. return (__m256i) ((__v4du)__A ^ (__v4du)__B);
791. }
/* Non-temporal (streaming) 32-byte load (vmovntdqa).  __X must be
   32-byte aligned; the non-temporal hint only applies to write-combining
   memory types.  */
792. extern __inline __m256i
793. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
794. _mm256_stream_load_si256 (__m256i const *__X)
795. {
796. return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
797. }
/* Broadcast the lowest element of __X to every element of the result.  */

/* Broadcast low float to all 4 elements (vbroadcastss, register form).  */
798. extern __inline __m128
799. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
800. _mm_broadcastss_ps (__m128 __X)
801. {
802. return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
803. }
/* Broadcast low float to all 8 elements (vbroadcastss).  */
804. extern __inline __m256
805. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
806. _mm256_broadcastss_ps (__m128 __X)
807. {
808. return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
809. }
/* Broadcast low double to all 4 elements (vbroadcastsd).  */
810. extern __inline __m256d
811. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
812. _mm256_broadcastsd_pd (__m128d __X)
813. {
814. return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
815. }
/* Duplicate the 128-bit value __X into both lanes (vbroadcasti128).  */
816. extern __inline __m256i
817. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
818. _mm256_broadcastsi128_si256 (__m128i __X)
819. {
820. return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
821. }
/* Compatibility aliases kept for the historical intrinsic names.  */
822. #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
823. #define _mm_broadcastsd_pd(X) _mm_movedup_pd(X)
/* Blend 32-bit elements: result element i comes from __Y when bit i of
   the immediate __M is set, otherwise from __X (vpblendd).  __M must be
   a compile-time constant, hence the macro forms when not optimizing.  */
824. #ifdef __OPTIMIZE__
825. extern __inline __m128i
826. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
827. _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
828. {
829. return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
830. (__v4si)__Y,
831. __M);
832. }
833. #else
834. #define _mm_blend_epi32(X, Y, M) \
835. ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
836. (__v4si)(__m128i)(Y), (int)(M)))
837. #endif
/* 256-bit form: 8 mask bits select among 8 dword elements.  */
838. #ifdef __OPTIMIZE__
839. extern __inline __m256i
840. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
841. _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
842. {
843. return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
844. (__v8si)__Y,
845. __M);
846. }
847. #else
848. #define _mm256_blend_epi32(X, Y, M) \
849. ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
850. (__v8si)(__m256i)(Y), (int)(M)))
851. #endif
/* Integer broadcasts: replicate the lowest element of __X across every
   element of the destination (vpbroadcastb/w/d/q), in 256- and 128-bit
   destination widths.  */

/* Broadcast low byte to all 32 elements.  */
852. extern __inline __m256i
853. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
854. _mm256_broadcastb_epi8 (__m128i __X)
855. {
856. return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
857. }
/* Broadcast low word to all 16 elements.  */
858. extern __inline __m256i
859. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
860. _mm256_broadcastw_epi16 (__m128i __X)
861. {
862. return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
863. }
/* Broadcast low dword to all 8 elements.  */
864. extern __inline __m256i
865. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
866. _mm256_broadcastd_epi32 (__m128i __X)
867. {
868. return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
869. }
/* Broadcast low qword to all 4 elements.  */
870. extern __inline __m256i
871. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
872. _mm256_broadcastq_epi64 (__m128i __X)
873. {
874. return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
875. }
/* 128-bit destination variants of the same broadcasts.  */
876. extern __inline __m128i
877. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
878. _mm_broadcastb_epi8 (__m128i __X)
879. {
880. return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
881. }
882. extern __inline __m128i
883. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
884. _mm_broadcastw_epi16 (__m128i __X)
885. {
886. return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
887. }
888. extern __inline __m128i
889. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
890. _mm_broadcastd_epi32 (__m128i __X)
891. {
892. return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
893. }
894. extern __inline __m128i
895. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
896. _mm_broadcastq_epi64 (__m128i __X)
897. {
898. return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
899. }
/* Cross-lane permutes.  The "permutevar8x32" forms take per-element
   indices from __Y; the "permute4x64"/"permute2x128" forms take a
   compile-time immediate (macro fallbacks when not optimizing).  */

/* Select dwords of __X by the indices in __Y (vpermd).  */
900. extern __inline __m256i
901. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
902. _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
903. {
904. return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
905. }
/* Select the four doubles of __X by 2-bit fields of __M (vpermpd).  */
906. #ifdef __OPTIMIZE__
907. extern __inline __m256d
908. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
909. _mm256_permute4x64_pd (__m256d __X, const int __M)
910. {
911. return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
912. }
913. #else
914. #define _mm256_permute4x64_pd(X, M) \
915. ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
916. #endif
/* Select floats of __X by the indices in __Y (vpermps).  */
917. extern __inline __m256
918. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
919. _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
920. {
921. return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
922. }
/* Select the four qwords of __X by 2-bit fields of __M (vpermq).  */
923. #ifdef __OPTIMIZE__
924. extern __inline __m256i
925. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
926. _mm256_permute4x64_epi64 (__m256i __X, const int __M)
927. {
928. return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
929. }
930. #else
931. #define _mm256_permute4x64_epi64(X, M) \
932. ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
933. #endif
/* Combine 128-bit lanes of __X/__Y as selected by __M (vperm2i128).  */
934. #ifdef __OPTIMIZE__
935. extern __inline __m256i
936. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
937. _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
938. {
939. return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
940. }
941. #else
942. #define _mm256_permute2x128_si256(X, Y, M) \
943. ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
944. #endif
/* Extract the 128-bit lane selected by the low bit of __M
   (vextracti128).  __M must be a compile-time constant.  */
945. #ifdef __OPTIMIZE__
946. extern __inline __m128i
947. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
948. _mm256_extracti128_si256 (__m256i __X, const int __M)
949. {
950. return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
951. }
952. #else
953. #define _mm256_extracti128_si256(X, M) \
954. ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
955. #endif
/* Replace the 128-bit lane selected by the low bit of __M with __Y
   (vinserti128).  */
956. #ifdef __OPTIMIZE__
957. extern __inline __m256i
958. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
959. _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
960. {
961. return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
962. }
963. #else
964. #define _mm256_inserti128_si256(X, Y, M) \
965. ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
966. (__v2di)(__m128i)(Y), \
967. (int)(M)))
968. #endif
/* Conditional loads/stores (vpmaskmovd/vpmaskmovq).  An element takes
   part only when the sign bit of the corresponding mask element is set;
   masked-off load elements are zeroed and masked-off store elements are
   left untouched.  Masked-off elements do not fault.  */

/* Masked load of 8 dwords from __X under mask __M.  */
969. extern __inline __m256i
970. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
971. _mm256_maskload_epi32 (int const *__X, __m256i __M )
972. {
973. return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
974. (__v8si)__M);
975. }
/* Masked load of 4 qwords from __X under mask __M.  */
976. extern __inline __m256i
977. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
978. _mm256_maskload_epi64 (long long const *__X, __m256i __M )
979. {
980. return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
981. (__v4di)__M);
982. }
/* Masked load of 4 dwords (128-bit form).  */
983. extern __inline __m128i
984. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
985. _mm_maskload_epi32 (int const *__X, __m128i __M )
986. {
987. return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
988. (__v4si)__M);
989. }
/* Masked load of 2 qwords (128-bit form).  */
990. extern __inline __m128i
991. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
992. _mm_maskload_epi64 (long long const *__X, __m128i __M )
993. {
994. return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
995. (__v2di)__M);
996. }
/* Masked store of 8 dwords of __Y to __X under mask __M.  */
997. extern __inline void
998. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
999. _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1000. {
1001. __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1002. }
/* Masked store of 4 qwords of __Y to __X under mask __M.  */
1003. extern __inline void
1004. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1005. _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1006. {
1007. __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1008. }
/* Masked store of 4 dwords (128-bit form).  */
1009. extern __inline void
1010. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1011. _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1012. {
1013. __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1014. }
/* Masked store of 2 qwords (128-bit form).  */
1015. extern __inline void
1016. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1017. _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1018. {
1019. __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1020. }
/* Per-element variable shifts (vpsllv*/vpsrav*/vpsrlv*): each element
   of __X is shifted by the count in the corresponding element of __Y.
   Logical shifts yield 0 for counts >= the element width; the
   arithmetic form fills with the sign bit.  There is no srav_epi64 in
   AVX2.  */

/* Variable left shift, 8 x 32-bit.  */
1021. extern __inline __m256i
1022. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1023. _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1024. {
1025. return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1026. }
/* Variable left shift, 4 x 32-bit.  */
1027. extern __inline __m128i
1028. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1029. _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1030. {
1031. return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1032. }
/* Variable left shift, 4 x 64-bit.  */
1033. extern __inline __m256i
1034. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1035. _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1036. {
1037. return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1038. }
/* Variable left shift, 2 x 64-bit.  */
1039. extern __inline __m128i
1040. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1041. _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1042. {
1043. return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1044. }
/* Variable arithmetic right shift, 8 x 32-bit.  */
1045. extern __inline __m256i
1046. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1047. _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1048. {
1049. return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1050. }
/* Variable arithmetic right shift, 4 x 32-bit.  */
1051. extern __inline __m128i
1052. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1053. _mm_srav_epi32 (__m128i __X, __m128i __Y)
1054. {
1055. return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1056. }
/* Variable logical right shift, 8 x 32-bit.  */
1057. extern __inline __m256i
1058. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1059. _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1060. {
1061. return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1062. }
/* Variable logical right shift, 4 x 32-bit.  */
1063. extern __inline __m128i
1064. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1065. _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1066. {
1067. return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1068. }
/* Variable logical right shift, 4 x 64-bit.  */
1069. extern __inline __m256i
1070. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1071. _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1072. {
1073. return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1074. }
/* Variable logical right shift, 2 x 64-bit.  */
1075. extern __inline __m128i
1076. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1077. _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1078. {
1079. return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1080. }
/* AVX2 gathers (double-precision group).  Each loads elements from
   __base + __index[i] * __scale; __scale must be a constant 1, 2, 4 or
   8, hence the __OPTIMIZE__ guard (macro fallbacks follow in the #else
   arm).  The unmasked forms build an all-ones mask via cmpeq(x, x) so
   every element is gathered; the mask forms gather only elements whose
   mask sign bit is set, keeping __src elements elsewhere.  */

/* Gather 2 doubles via 32-bit indices.  */
1081. #ifdef __OPTIMIZE__
1082. extern __inline __m128d
1083. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1084. _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1085. {
1086. __v2df __zero = _mm_setzero_pd ();
1087. __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
/* Source operand is irrelevant with an all-ones mask, so pass an
   undefined value rather than forcing a zeroing.  */
1088. return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
1089. __base,
1090. (__v4si)__index,
1091. __mask,
1092. __scale);
1093. }
/* Masked gather of 2 doubles via 32-bit indices.  */
1094. extern __inline __m128d
1095. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1096. _mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
1097. __m128d __mask, const int __scale)
1098. {
1099. return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
1100. __base,
1101. (__v4si)__index,
1102. (__v2df)__mask,
1103. __scale);
1104. }
/* Gather 4 doubles via 32-bit indices.  */
1105. extern __inline __m256d
1106. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1107. _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1108. {
1109. __v4df __zero = _mm256_setzero_pd ();
1110. __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
1111. return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
1112. __base,
1113. (__v4si)__index,
1114. __mask,
1115. __scale);
1116. }
/* Masked gather of 4 doubles via 32-bit indices.  */
1117. extern __inline __m256d
1118. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1119. _mm256_mask_i32gather_pd (__m256d __src, double const *__base,
1120. __m128i __index, __m256d __mask, const int __scale)
1121. {
1122. return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
1123. __base,
1124. (__v4si)__index,
1125. (__v4df)__mask,
1126. __scale);
1127. }
/* Gather 2 doubles via 64-bit indices.  */
1128. extern __inline __m128d
1129. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1130. _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
1131. {
1132. __v2df __src = _mm_setzero_pd ();
1133. __v2df __mask = _mm_cmpeq_pd (__src, __src);
1134. return (__m128d) __builtin_ia32_gatherdiv2df (__src,
1135. __base,
1136. (__v2di)__index,
1137. __mask,
1138. __scale);
1139. }
/* Masked gather of 2 doubles via 64-bit indices.  */
1140. extern __inline __m128d
1141. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1142. _mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
1143. __m128d __mask, const int __scale)
1144. {
1145. return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
1146. __base,
1147. (__v2di)__index,
1148. (__v2df)__mask,
1149. __scale);
1150. }
/* Gather 4 doubles via 64-bit indices.  */
1151. extern __inline __m256d
1152. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1153. _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
1154. {
1155. __v4df __src = _mm256_setzero_pd ();
1156. __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
1157. return (__m256d) __builtin_ia32_gatherdiv4df (__src,
1158. __base,
1159. (__v4di)__index,
1160. __mask,
1161. __scale);
1162. }
/* Masked gather of 4 doubles via 64-bit indices.  */
1163. extern __inline __m256d
1164. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1165. _mm256_mask_i64gather_pd (__m256d __src, double const *__base,
1166. __m256i __index, __m256d __mask, const int __scale)
1167. {
1168. return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
1169. __base,
1170. (__v4di)__index,
1171. (__v4df)__mask,
1172. __scale);
1173. }
/* Single-precision gathers.  Same masking convention as the pd group:
   unmasked forms synthesize an all-ones mask with cmpeq(src, src);
   masked forms keep __src where the mask sign bit is clear.  With
   64-bit indices only as many floats as indices are produced, so the
   256-bit-index forms return __m128.  */

/* Gather 4 floats via 32-bit indices.  */
1174. extern __inline __m128
1175. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1176. _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
1177. {
1178. __v4sf __src = _mm_setzero_ps ();
1179. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1180. return (__m128) __builtin_ia32_gathersiv4sf (__src,
1181. __base,
1182. (__v4si)__index,
1183. __mask,
1184. __scale);
1185. }
/* Masked gather of 4 floats via 32-bit indices.  */
1186. extern __inline __m128
1187. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1188. _mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
1189. __m128 __mask, const int __scale)
1190. {
1191. return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
1192. __base,
1193. (__v4si)__index,
1194. (__v4sf)__mask,
1195. __scale);
1196. }
/* Gather 8 floats via 32-bit indices.  */
1197. extern __inline __m256
1198. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1199. _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
1200. {
1201. __v8sf __src = _mm256_setzero_ps ();
1202. __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
1203. return (__m256) __builtin_ia32_gathersiv8sf (__src,
1204. __base,
1205. (__v8si)__index,
1206. __mask,
1207. __scale);
1208. }
/* Masked gather of 8 floats via 32-bit indices.  */
1209. extern __inline __m256
1210. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1211. _mm256_mask_i32gather_ps (__m256 __src, float const *__base,
1212. __m256i __index, __m256 __mask, const int __scale)
1213. {
1214. return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
1215. __base,
1216. (__v8si)__index,
1217. (__v8sf)__mask,
1218. __scale);
1219. }
/* Gather 4 floats via the 2 low 64-bit indices (upper result zeroed
   by the instruction).  */
1220. extern __inline __m128
1221. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1222. _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
1223. {
1224. __v4sf __src = _mm_setzero_ps ();
1225. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1226. return (__m128) __builtin_ia32_gatherdiv4sf (__src,
1227. __base,
1228. (__v2di)__index,
1229. __mask,
1230. __scale);
1231. }
/* Masked gather of floats via 2 x 64-bit indices.  */
1232. extern __inline __m128
1233. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1234. _mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
1235. __m128 __mask, const int __scale)
1236. {
1237. return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
1238. __base,
1239. (__v2di)__index,
1240. (__v4sf)__mask,
1241. __scale);
1242. }
/* Gather 4 floats via 4 x 64-bit indices; note the __m128 result.  */
1243. extern __inline __m128
1244. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1245. _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
1246. {
1247. __v4sf __src = _mm_setzero_ps ();
1248. __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1249. return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
1250. __base,
1251. (__v4di)__index,
1252. __mask,
1253. __scale);
1254. }
/* Masked gather of 4 floats via 4 x 64-bit indices.  */
1255. extern __inline __m128
1256. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1257. _mm256_mask_i64gather_ps (__m128 __src, float const *__base,
1258. __m256i __index, __m128 __mask, const int __scale)
1259. {
1260. return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
1261. __base,
1262. (__v4di)__index,
1263. (__v4sf)__mask,
1264. __scale);
1265. }
/* 64-bit integer gathers.  All-ones integer masks are built directly
   with vector literals ({ ~0, ... }); otherwise the masking convention
   matches the floating-point gathers above.  */

/* Gather 2 qwords via 32-bit indices.  */
1266. extern __inline __m128i
1267. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1268. _mm_i32gather_epi64 (long long int const *__base,
1269. __m128i __index, const int __scale)
1270. {
1271. __v2di __src = __extension__ (__v2di){ 0, 0 };
1272. __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1273. return (__m128i) __builtin_ia32_gathersiv2di (__src,
1274. __base,
1275. (__v4si)__index,
1276. __mask,
1277. __scale);
1278. }
/* Masked gather of 2 qwords via 32-bit indices.  */
1279. extern __inline __m128i
1280. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1281. _mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
1282. __m128i __index, __m128i __mask, const int __scale)
1283. {
1284. return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
1285. __base,
1286. (__v4si)__index,
1287. (__v2di)__mask,
1288. __scale);
1289. }
/* Gather 4 qwords via 32-bit indices.  */
1290. extern __inline __m256i
1291. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1292. _mm256_i32gather_epi64 (long long int const *__base,
1293. __m128i __index, const int __scale)
1294. {
1295. __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1296. __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1297. return (__m256i) __builtin_ia32_gathersiv4di (__src,
1298. __base,
1299. (__v4si)__index,
1300. __mask,
1301. __scale);
1302. }
/* Masked gather of 4 qwords via 32-bit indices.  */
1303. extern __inline __m256i
1304. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1305. _mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
1306. __m128i __index, __m256i __mask,
1307. const int __scale)
1308. {
1309. return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
1310. __base,
1311. (__v4si)__index,
1312. (__v4di)__mask,
1313. __scale);
1314. }
/* Gather 2 qwords via 64-bit indices.  */
1315. extern __inline __m128i
1316. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1317. _mm_i64gather_epi64 (long long int const *__base,
1318. __m128i __index, const int __scale)
1319. {
1320. __v2di __src = __extension__ (__v2di){ 0, 0 };
1321. __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1322. return (__m128i) __builtin_ia32_gatherdiv2di (__src,
1323. __base,
1324. (__v2di)__index,
1325. __mask,
1326. __scale);
1327. }
/* Masked gather of 2 qwords via 64-bit indices.  */
1328. extern __inline __m128i
1329. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1330. _mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
1331. __m128i __index, __m128i __mask, const int __scale)
1332. {
1333. return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
1334. __base,
1335. (__v2di)__index,
1336. (__v2di)__mask,
1337. __scale);
1338. }
/* Gather 4 qwords via 64-bit indices.  */
1339. extern __inline __m256i
1340. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1341. _mm256_i64gather_epi64 (long long int const *__base,
1342. __m256i __index, const int __scale)
1343. {
1344. __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1345. __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1346. return (__m256i) __builtin_ia32_gatherdiv4di (__src,
1347. __base,
1348. (__v4di)__index,
1349. __mask,
1350. __scale);
1351. }
/* Masked gather of 4 qwords via 64-bit indices.  */
1352. extern __inline __m256i
1353. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1354. _mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
1355. __m256i __index, __m256i __mask,
1356. const int __scale)
1357. {
1358. return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
1359. __base,
1360. (__v4di)__index,
1361. (__v4di)__mask,
1362. __scale);
1363. }
  1364. extern __inline __m128i
  1365. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1366. _mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
  1367. {
  1368. __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  1369. __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  1370. return (__m128i) __builtin_ia32_gathersiv4si (__src,
  1371. __base,
  1372. (__v4si)__index,
  1373. __mask,
  1374. __scale);
  1375. }
  1376. extern __inline __m128i
  1377. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1378. _mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
  1379. __m128i __mask, const int __scale)
  1380. {
  1381. return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
  1382. __base,
  1383. (__v4si)__index,
  1384. (__v4si)__mask,
  1385. __scale);
  1386. }
  1387. extern __inline __m256i
  1388. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1389. _mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
  1390. {
  1391. __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  1392. __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
  1393. return (__m256i) __builtin_ia32_gathersiv8si (__src,
  1394. __base,
  1395. (__v8si)__index,
  1396. __mask,
  1397. __scale);
  1398. }
  1399. extern __inline __m256i
  1400. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1401. _mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
  1402. __m256i __index, __m256i __mask,
  1403. const int __scale)
  1404. {
  1405. return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
  1406. __base,
  1407. (__v8si)__index,
  1408. (__v8si)__mask,
  1409. __scale);
  1410. }
  1411. extern __inline __m128i
  1412. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1413. _mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
  1414. {
  1415. __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  1416. __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  1417. return (__m128i) __builtin_ia32_gatherdiv4si (__src,
  1418. __base,
  1419. (__v2di)__index,
  1420. __mask,
  1421. __scale);
  1422. }
  1423. extern __inline __m128i
  1424. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  1425. _mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
  1426. __m128i __mask, const int __scale)
  1427. {
  1428. return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
  1429. __base,
  1430. (__v2di)__index,
  1431. (__v4si)__mask,
  1432. __scale);
  1433. }
/* Gather four 32-bit integers using four 64-bit indices from a 256-bit
   index vector; the result narrows to 128 bits.  Unconditional form:
   zero source, all-ones mask.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
{
  /* Zero source — unused because every mask lane is set.  */
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  /* All-ones mask: gather every lane.  */
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
  return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
						   __base,
						   (__v4di)__index,
						   __mask,
						   __scale);
}
/* Masked gather of four 32-bit integers with four 64-bit indices:
   lanes whose mask sign bit is set come from memory, the rest from
   __src.  Result is 128 bits wide.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
			     __m256i __index, __m128i __mask,
			     const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
						   __base,
						   (__v4di)__index,
						   (__v4si)__mask,
						   __scale);
}
  1458. #else /* __OPTIMIZE__ */
/* Macro forms of the double-precision gathers, used when !__OPTIMIZE__
   so that SCALE stays a literal constant in the builtin call.  The
   non-masked variants synthesize an all-ones mask by comparing zero
   with itself (EQ is true for every lane).  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2df) \
					 _mm_cmpeq_pd (_mm_setzero_pd (),\
						       _mm_setzero_pd ()),\
					 (int) (SCALE))

/* Masked variant: caller supplies SRC (fallback lanes) and MASK.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
					 (double const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2df)(__m128d) (MASK), \
					 (int) (SCALE))

#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4df) \
					 _mm256_cmp_pd (_mm256_setzero_pd (),\
							_mm256_setzero_pd (),\
							_CMP_EQ_OQ), \
					 (int) (SCALE))

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
					 (double const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4df)(__m256d) (MASK), \
					 (int) (SCALE))

/* 64-bit-index forms of the same gathers.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2df) \
					 _mm_cmpeq_pd (_mm_setzero_pd (),\
						       _mm_setzero_pd ()),\
					 (int) (SCALE))

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
					 (double const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2df)(__m128d) (MASK), \
					 (int) (SCALE))

#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4df) \
					 _mm256_cmp_pd (_mm256_setzero_pd (),\
							_mm256_setzero_pd (),\
							_CMP_EQ_OQ), \
					 (int) (SCALE))

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
					 (double const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4df)(__m256d) (MASK), \
					 (int) (SCALE))
/* Single-precision gathers with 32-bit indices (macro forms for
   !__OPTIMIZE__).  Non-masked variants build an all-ones mask via a
   self-equality compare of zero.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
					(float const *) (BASE), \
					(__v4si)(__m128i) (INDEX), \
					(__v4sf) \
					_mm_cmpeq_ps (_mm_setzero_ps (),\
						      _mm_setzero_ps ()),\
					(int) (SCALE))

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
					(float const *) (BASE), \
					(__v4si)(__m128i) (INDEX), \
					(__v4sf)(__m128) (MASK), \
					(int) (SCALE))

#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *) (BASE), \
					(__v8si)(__m256i) (INDEX), \
					(__v8sf) \
					_mm256_cmp_ps (_mm256_setzero_ps (),\
						       _mm256_setzero_ps (),\
						       _CMP_EQ_OQ), \
					(int) (SCALE))

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
					(float const *) (BASE), \
					(__v8si)(__m256i) (INDEX), \
					(__v8sf)(__m256) (MASK), \
					(int) (SCALE))
  1546. #define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  1547. (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \
  1548. (float const *) (BASE), \
  1549. (__v2di)(__m128i) (INDEX), \
  1550. (__v4sf) \
  1551. _mm_cmpeq_ps (_mm_setzero_ps (),\
  1552. _mm_setzero_ps ()),\
  1553. (int) (SCALE))
/* Remaining single-precision gathers with 64-bit indices.  The 256-bit
   index variants return a 128-bit result (four floats).  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
					(float const *) (BASE), \
					(__v2di)(__m128i) (INDEX), \
					(__v4sf)(__m128) (MASK), \
					(int) (SCALE))

#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
					   (float const *) (BASE), \
					   (__v4di)(__m256i) (INDEX), \
					   (__v4sf) \
					   _mm_cmpeq_ps (_mm_setzero_ps (),\
							 _mm_setzero_ps ()),\
					   (int) (SCALE))

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
					   (float const *) (BASE), \
					   (__v4di)(__m256i) (INDEX), \
					   (__v4sf)(__m128) (MASK), \
					   (int) (SCALE))
/* 64-bit integer gathers (macro forms for !__OPTIMIZE__).  Non-masked
   variants use a zero source and an all-ones mask built with
   _mm*_set1_epi64x (-1), so every lane is gathered.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2di)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4di)(__m256i) (MASK), \
					 (int) (SCALE))

/* 64-bit-index forms.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
					 (long long const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2di)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4di)(__m256i) (MASK), \
					 (int) (SCALE))
/* 32-bit integer gathers (macro forms for !__OPTIMIZE__).  Non-masked
   variants use a zero source and an all-ones mask built with
   _mm*_set1_epi32 (-1), so every lane is gathered.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
					 (int const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4si)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *) (BASE), \
					 (__v8si)(__m256i) (INDEX), \
					 (__v8si)_mm256_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
					 (int const *) (BASE), \
					 (__v8si)(__m256i) (INDEX), \
					 (__v8si)(__m256i) (MASK), \
					 (int) (SCALE))

/* 64-bit-index forms; the 256-bit index variants narrow to a 128-bit
   result (four ints).  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
					 (int const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v4si)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *) (BASE), \
					    (__v4di)(__m256i) (INDEX), \
					    (__v4si)_mm_set1_epi32(-1), \
					    (int) (SCALE))

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
					    (int const *) (BASE), \
					    (__v4di)(__m256i) (INDEX), \
					    (__v4si)(__m128i) (MASK), \
					    (int) (SCALE))
  1670. #endif /* __OPTIMIZE__ */
  1671. #ifdef __DISABLE_AVX2__
  1672. #undef __DISABLE_AVX2__
  1673. #pragma GCC pop_options
  1674. #endif /* __DISABLE_AVX2__ */
  1675. #endif /* _AVX2INTRIN_H_INCLUDED */