avxintrin.h 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529
  1. /* Copyright (C) 2008-2019 Free Software Foundation, Inc.
  2. This file is part of GCC.
  3. GCC is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 3, or (at your option)
  6. any later version.
  7. GCC is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. Under Section 7 of GPL version 3, you are granted additional
  12. permissions described in the GCC Runtime Library Exception, version
  13. 3.1, as published by the Free Software Foundation.
  14. You should have received a copy of the GNU General Public License and
  15. a copy of the GCC Runtime Library Exception along with this program;
  16. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  17. <http://www.gnu.org/licenses/>. */
  18. /* Implemented from the specification included in the Intel C++ Compiler
  19. User Guide and Reference, version 11.0. */
  20. #ifndef _IMMINTRIN_H_INCLUDED
  21. # error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
  22. #endif
  23. #ifndef _AVXINTRIN_H_INCLUDED
  24. #define _AVXINTRIN_H_INCLUDED
  25. #ifndef __AVX__
  26. #pragma GCC push_options
  27. #pragma GCC target("avx")
  28. #define __DISABLE_AVX__
  29. #endif /* __AVX__ */
  30. /* Internal data types for implementing the intrinsics. */
  31. typedef double __v4df __attribute__ ((__vector_size__ (32)));
  32. typedef float __v8sf __attribute__ ((__vector_size__ (32)));
  33. typedef long long __v4di __attribute__ ((__vector_size__ (32)));
  34. typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
  35. typedef int __v8si __attribute__ ((__vector_size__ (32)));
  36. typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
  37. typedef short __v16hi __attribute__ ((__vector_size__ (32)));
  38. typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
  39. typedef char __v32qi __attribute__ ((__vector_size__ (32)));
  40. typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
  41. typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
  42. /* The Intel API is flexible enough that we must allow aliasing with other
  43. vector types, and their scalar components. */
  44. typedef float __m256 __attribute__ ((__vector_size__ (32),
  45. __may_alias__));
  46. typedef long long __m256i __attribute__ ((__vector_size__ (32),
  47. __may_alias__));
  48. typedef double __m256d __attribute__ ((__vector_size__ (32),
  49. __may_alias__));
  50. /* Unaligned version of the same types. */
  51. typedef float __m256_u __attribute__ ((__vector_size__ (32),
  52. __may_alias__,
  53. __aligned__ (1)));
  54. typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
  55. __may_alias__,
  56. __aligned__ (1)));
  57. typedef double __m256d_u __attribute__ ((__vector_size__ (32),
  58. __may_alias__,
  59. __aligned__ (1)));
  60. /* Compare predicates for scalar and packed compare intrinsics. */
  61. /* Equal (ordered, non-signaling) */
  62. #define _CMP_EQ_OQ 0x00
  63. /* Less-than (ordered, signaling) */
  64. #define _CMP_LT_OS 0x01
  65. /* Less-than-or-equal (ordered, signaling) */
  66. #define _CMP_LE_OS 0x02
  67. /* Unordered (non-signaling) */
  68. #define _CMP_UNORD_Q 0x03
  69. /* Not-equal (unordered, non-signaling) */
  70. #define _CMP_NEQ_UQ 0x04
  71. /* Not-less-than (unordered, signaling) */
  72. #define _CMP_NLT_US 0x05
  73. /* Not-less-than-or-equal (unordered, signaling) */
  74. #define _CMP_NLE_US 0x06
  75. /* Ordered (nonsignaling) */
  76. #define _CMP_ORD_Q 0x07
  77. /* Equal (unordered, non-signaling) */
  78. #define _CMP_EQ_UQ 0x08
  79. /* Not-greater-than-or-equal (unordered, signaling) */
  80. #define _CMP_NGE_US 0x09
  81. /* Not-greater-than (unordered, signaling) */
  82. #define _CMP_NGT_US 0x0a
  83. /* False (ordered, non-signaling) */
  84. #define _CMP_FALSE_OQ 0x0b
  85. /* Not-equal (ordered, non-signaling) */
  86. #define _CMP_NEQ_OQ 0x0c
  87. /* Greater-than-or-equal (ordered, signaling) */
  88. #define _CMP_GE_OS 0x0d
  89. /* Greater-than (ordered, signaling) */
  90. #define _CMP_GT_OS 0x0e
  91. /* True (unordered, non-signaling) */
  92. #define _CMP_TRUE_UQ 0x0f
  93. /* Equal (ordered, signaling) */
  94. #define _CMP_EQ_OS 0x10
  95. /* Less-than (ordered, non-signaling) */
  96. #define _CMP_LT_OQ 0x11
  97. /* Less-than-or-equal (ordered, non-signaling) */
  98. #define _CMP_LE_OQ 0x12
  99. /* Unordered (signaling) */
  100. #define _CMP_UNORD_S 0x13
  101. /* Not-equal (unordered, signaling) */
  102. #define _CMP_NEQ_US 0x14
  103. /* Not-less-than (unordered, non-signaling) */
  104. #define _CMP_NLT_UQ 0x15
  105. /* Not-less-than-or-equal (unordered, non-signaling) */
  106. #define _CMP_NLE_UQ 0x16
  107. /* Ordered (signaling) */
  108. #define _CMP_ORD_S 0x17
  109. /* Equal (unordered, signaling) */
  110. #define _CMP_EQ_US 0x18
  111. /* Not-greater-than-or-equal (unordered, non-signaling) */
  112. #define _CMP_NGE_UQ 0x19
  113. /* Not-greater-than (unordered, non-signaling) */
  114. #define _CMP_NGT_UQ 0x1a
  115. /* False (ordered, signaling) */
  116. #define _CMP_FALSE_OS 0x1b
  117. /* Not-equal (ordered, signaling) */
  118. #define _CMP_NEQ_OS 0x1c
  119. /* Greater-than-or-equal (ordered, non-signaling) */
  120. #define _CMP_GE_OQ 0x1d
  121. /* Greater-than (ordered, non-signaling) */
  122. #define _CMP_GT_OQ 0x1e
  123. /* True (unordered, signaling) */
  124. #define _CMP_TRUE_US 0x1f
  125. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  126. _mm256_add_pd (__m256d __A, __m256d __B)
  127. {
  128. return (__m256d) ((__v4df)__A + (__v4df)__B);
  129. }
  130. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  131. _mm256_add_ps (__m256 __A, __m256 __B)
  132. {
  133. return (__m256) ((__v8sf)__A + (__v8sf)__B);
  134. }
  135. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  136. _mm256_addsub_pd (__m256d __A, __m256d __B)
  137. {
  138. return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
  139. }
  140. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  141. _mm256_addsub_ps (__m256 __A, __m256 __B)
  142. {
  143. return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
  144. }
  145. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  146. _mm256_and_pd (__m256d __A, __m256d __B)
  147. {
  148. return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
  149. }
  150. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  151. _mm256_and_ps (__m256 __A, __m256 __B)
  152. {
  153. return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
  154. }
  155. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  156. _mm256_andnot_pd (__m256d __A, __m256d __B)
  157. {
  158. return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
  159. }
  160. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  161. _mm256_andnot_ps (__m256 __A, __m256 __B)
  162. {
  163. return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
  164. }
  165. /* Double/single precision floating point blend instructions - select
  166. data from 2 sources using constant/variable mask. */
  167. #ifdef __OPTIMIZE__
  168. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  169. _mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
  170. {
  171. return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
  172. (__v4df)__Y,
  173. __M);
  174. }
  175. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  176. _mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
  177. {
  178. return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
  179. (__v8sf)__Y,
  180. __M);
  181. }
  182. #else
  183. #define _mm256_blend_pd(X, Y, M) \
  184. ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
  185. (__v4df)(__m256d)(Y), (int)(M)))
  186. #define _mm256_blend_ps(X, Y, M) \
  187. ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
  188. (__v8sf)(__m256)(Y), (int)(M)))
  189. #endif
  190. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  191. _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
  192. {
  193. return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
  194. (__v4df)__Y,
  195. (__v4df)__M);
  196. }
  197. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  198. _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
  199. {
  200. return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
  201. (__v8sf)__Y,
  202. (__v8sf)__M);
  203. }
  204. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  205. _mm256_div_pd (__m256d __A, __m256d __B)
  206. {
  207. return (__m256d) ((__v4df)__A / (__v4df)__B);
  208. }
  209. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  210. _mm256_div_ps (__m256 __A, __m256 __B)
  211. {
  212. return (__m256) ((__v8sf)__A / (__v8sf)__B);
  213. }
  214. /* Dot product instructions with mask-defined summing and zeroing parts
  215. of result. */
  216. #ifdef __OPTIMIZE__
  217. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  218. _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
  219. {
  220. return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
  221. (__v8sf)__Y,
  222. __M);
  223. }
  224. #else
  225. #define _mm256_dp_ps(X, Y, M) \
  226. ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
  227. (__v8sf)(__m256)(Y), (int)(M)))
  228. #endif
  229. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  230. _mm256_hadd_pd (__m256d __X, __m256d __Y)
  231. {
  232. return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
  233. }
  234. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  235. _mm256_hadd_ps (__m256 __X, __m256 __Y)
  236. {
  237. return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
  238. }
  239. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  240. _mm256_hsub_pd (__m256d __X, __m256d __Y)
  241. {
  242. return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
  243. }
  244. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  245. _mm256_hsub_ps (__m256 __X, __m256 __Y)
  246. {
  247. return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
  248. }
  249. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  250. _mm256_max_pd (__m256d __A, __m256d __B)
  251. {
  252. return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
  253. }
  254. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  255. _mm256_max_ps (__m256 __A, __m256 __B)
  256. {
  257. return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
  258. }
  259. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  260. _mm256_min_pd (__m256d __A, __m256d __B)
  261. {
  262. return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
  263. }
  264. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  265. _mm256_min_ps (__m256 __A, __m256 __B)
  266. {
  267. return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
  268. }
  269. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  270. _mm256_mul_pd (__m256d __A, __m256d __B)
  271. {
  272. return (__m256d) ((__v4df)__A * (__v4df)__B);
  273. }
  274. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  275. _mm256_mul_ps (__m256 __A, __m256 __B)
  276. {
  277. return (__m256) ((__v8sf)__A * (__v8sf)__B);
  278. }
  279. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  280. _mm256_or_pd (__m256d __A, __m256d __B)
  281. {
  282. return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
  283. }
  284. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  285. _mm256_or_ps (__m256 __A, __m256 __B)
  286. {
  287. return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
  288. }
  289. #ifdef __OPTIMIZE__
  290. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  291. _mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
  292. {
  293. return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
  294. __mask);
  295. }
  296. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  297. _mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
  298. {
  299. return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
  300. __mask);
  301. }
  302. #else
  303. #define _mm256_shuffle_pd(A, B, N) \
  304. ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
  305. (__v4df)(__m256d)(B), (int)(N)))
  306. #define _mm256_shuffle_ps(A, B, N) \
  307. ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
  308. (__v8sf)(__m256)(B), (int)(N)))
  309. #endif
  310. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  311. _mm256_sub_pd (__m256d __A, __m256d __B)
  312. {
  313. return (__m256d) ((__v4df)__A - (__v4df)__B);
  314. }
  315. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  316. _mm256_sub_ps (__m256 __A, __m256 __B)
  317. {
  318. return (__m256) ((__v8sf)__A - (__v8sf)__B);
  319. }
  320. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  321. _mm256_xor_pd (__m256d __A, __m256d __B)
  322. {
  323. return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
  324. }
  325. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  326. _mm256_xor_ps (__m256 __A, __m256 __B)
  327. {
  328. return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
  329. }
  330. #ifdef __OPTIMIZE__
  331. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  332. _mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
  333. {
  334. return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
  335. }
  336. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  337. _mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
  338. {
  339. return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
  340. }
  341. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  342. _mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
  343. {
  344. return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
  345. __P);
  346. }
  347. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  348. _mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
  349. {
  350. return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
  351. __P);
  352. }
  353. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  354. _mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
  355. {
  356. return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
  357. }
  358. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  359. _mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
  360. {
  361. return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
  362. }
  363. #else
  364. #define _mm_cmp_pd(X, Y, P) \
  365. ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
  366. (__v2df)(__m128d)(Y), (int)(P)))
  367. #define _mm_cmp_ps(X, Y, P) \
  368. ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
  369. (__v4sf)(__m128)(Y), (int)(P)))
  370. #define _mm256_cmp_pd(X, Y, P) \
  371. ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
  372. (__v4df)(__m256d)(Y), (int)(P)))
  373. #define _mm256_cmp_ps(X, Y, P) \
  374. ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
  375. (__v8sf)(__m256)(Y), (int)(P)))
  376. #define _mm_cmp_sd(X, Y, P) \
  377. ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
  378. (__v2df)(__m128d)(Y), (int)(P)))
  379. #define _mm_cmp_ss(X, Y, P) \
  380. ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
  381. (__v4sf)(__m128)(Y), (int)(P)))
  382. #endif
  383. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  384. _mm256_cvtepi32_pd (__m128i __A)
  385. {
  386. return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
  387. }
  388. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  389. _mm256_cvtepi32_ps (__m256i __A)
  390. {
  391. return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
  392. }
  393. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  394. _mm256_cvtpd_ps (__m256d __A)
  395. {
  396. return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
  397. }
  398. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  399. _mm256_cvtps_epi32 (__m256 __A)
  400. {
  401. return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
  402. }
  403. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  404. _mm256_cvtps_pd (__m128 __A)
  405. {
  406. return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
  407. }
  408. extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  409. _mm256_cvttpd_epi32 (__m256d __A)
  410. {
  411. return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
  412. }
  413. extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  414. _mm256_cvtpd_epi32 (__m256d __A)
  415. {
  416. return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
  417. }
  418. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  419. _mm256_cvttps_epi32 (__m256 __A)
  420. {
  421. return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
  422. }
  423. extern __inline double
  424. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  425. _mm256_cvtsd_f64 (__m256d __A)
  426. {
  427. return __A[0];
  428. }
  429. extern __inline float
  430. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  431. _mm256_cvtss_f32 (__m256 __A)
  432. {
  433. return __A[0];
  434. }
  435. #ifdef __OPTIMIZE__
  436. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  437. _mm256_extractf128_pd (__m256d __X, const int __N)
  438. {
  439. return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
  440. }
  441. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  442. _mm256_extractf128_ps (__m256 __X, const int __N)
  443. {
  444. return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
  445. }
  446. extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  447. _mm256_extractf128_si256 (__m256i __X, const int __N)
  448. {
  449. return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
  450. }
  451. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  452. _mm256_extract_epi32 (__m256i __X, int const __N)
  453. {
  454. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  455. return _mm_extract_epi32 (__Y, __N % 4);
  456. }
  457. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  458. _mm256_extract_epi16 (__m256i __X, int const __N)
  459. {
  460. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  461. return _mm_extract_epi16 (__Y, __N % 8);
  462. }
  463. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  464. _mm256_extract_epi8 (__m256i __X, int const __N)
  465. {
  466. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  467. return _mm_extract_epi8 (__Y, __N % 16);
  468. }
  469. #ifdef __x86_64__
  470. extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  471. _mm256_extract_epi64 (__m256i __X, const int __N)
  472. {
  473. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  474. return _mm_extract_epi64 (__Y, __N % 2);
  475. }
  476. #endif
  477. #else
  478. #define _mm256_extractf128_pd(X, N) \
  479. ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
  480. (int)(N)))
  481. #define _mm256_extractf128_ps(X, N) \
  482. ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
  483. (int)(N)))
  484. #define _mm256_extractf128_si256(X, N) \
  485. ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
  486. (int)(N)))
  487. #define _mm256_extract_epi32(X, N) \
  488. (__extension__ \
  489. ({ \
  490. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
  491. _mm_extract_epi32 (__Y, (N) % 4); \
  492. }))
  493. #define _mm256_extract_epi16(X, N) \
  494. (__extension__ \
  495. ({ \
  496. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
  497. _mm_extract_epi16 (__Y, (N) % 8); \
  498. }))
  499. #define _mm256_extract_epi8(X, N) \
  500. (__extension__ \
  501. ({ \
  502. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
  503. _mm_extract_epi8 (__Y, (N) % 16); \
  504. }))
  505. #ifdef __x86_64__
  506. #define _mm256_extract_epi64(X, N) \
  507. (__extension__ \
  508. ({ \
  509. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
  510. _mm_extract_epi64 (__Y, (N) % 2); \
  511. }))
  512. #endif
  513. #endif
  514. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  515. _mm256_zeroall (void)
  516. {
  517. __builtin_ia32_vzeroall ();
  518. }
  519. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  520. _mm256_zeroupper (void)
  521. {
  522. __builtin_ia32_vzeroupper ();
  523. }
  524. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  525. _mm_permutevar_pd (__m128d __A, __m128i __C)
  526. {
  527. return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
  528. (__v2di)__C);
  529. }
  530. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  531. _mm256_permutevar_pd (__m256d __A, __m256i __C)
  532. {
  533. return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
  534. (__v4di)__C);
  535. }
  536. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  537. _mm_permutevar_ps (__m128 __A, __m128i __C)
  538. {
  539. return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
  540. (__v4si)__C);
  541. }
  542. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  543. _mm256_permutevar_ps (__m256 __A, __m256i __C)
  544. {
  545. return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
  546. (__v8si)__C);
  547. }
  548. #ifdef __OPTIMIZE__
  549. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  550. _mm_permute_pd (__m128d __X, const int __C)
  551. {
  552. return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
  553. }
  554. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  555. _mm256_permute_pd (__m256d __X, const int __C)
  556. {
  557. return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
  558. }
  559. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  560. _mm_permute_ps (__m128 __X, const int __C)
  561. {
  562. return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
  563. }
  564. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  565. _mm256_permute_ps (__m256 __X, const int __C)
  566. {
  567. return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
  568. }
  569. #else
  570. #define _mm_permute_pd(X, C) \
  571. ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
  572. #define _mm256_permute_pd(X, C) \
  573. ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))
  574. #define _mm_permute_ps(X, C) \
  575. ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
  576. #define _mm256_permute_ps(X, C) \
  577. ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
  578. #endif
  579. #ifdef __OPTIMIZE__
  580. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  581. _mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
  582. {
  583. return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
  584. (__v4df)__Y,
  585. __C);
  586. }
  587. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  588. _mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
  589. {
  590. return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
  591. (__v8sf)__Y,
  592. __C);
  593. }
  594. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  595. _mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
  596. {
  597. return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
  598. (__v8si)__Y,
  599. __C);
  600. }
  601. #else
  602. #define _mm256_permute2f128_pd(X, Y, C) \
  603. ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
  604. (__v4df)(__m256d)(Y), \
  605. (int)(C)))
  606. #define _mm256_permute2f128_ps(X, Y, C) \
  607. ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
  608. (__v8sf)(__m256)(Y), \
  609. (int)(C)))
  610. #define _mm256_permute2f128_si256(X, Y, C) \
  611. ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
  612. (__v8si)(__m256i)(Y), \
  613. (int)(C)))
  614. #endif
  615. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  616. _mm_broadcast_ss (float const *__X)
  617. {
  618. return (__m128) __builtin_ia32_vbroadcastss (__X);
  619. }
  620. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  621. _mm256_broadcast_sd (double const *__X)
  622. {
  623. return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
  624. }
  625. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  626. _mm256_broadcast_ss (float const *__X)
  627. {
  628. return (__m256) __builtin_ia32_vbroadcastss256 (__X);
  629. }
  630. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  631. _mm256_broadcast_pd (__m128d const *__X)
  632. {
  633. return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
  634. }
  635. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  636. _mm256_broadcast_ps (__m128 const *__X)
  637. {
  638. return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
  639. }
  640. #ifdef __OPTIMIZE__
  641. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  642. _mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
  643. {
  644. return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
  645. (__v2df)__Y,
  646. __O);
  647. }
  648. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  649. _mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
  650. {
  651. return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
  652. (__v4sf)__Y,
  653. __O);
  654. }
  655. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  656. _mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
  657. {
  658. return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
  659. (__v4si)__Y,
  660. __O);
  661. }
  662. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  663. _mm256_insert_epi32 (__m256i __X, int __D, int const __N)
  664. {
  665. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  666. __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  667. return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
  668. }
  669. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  670. _mm256_insert_epi16 (__m256i __X, int __D, int const __N)
  671. {
  672. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  673. __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  674. return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
  675. }
  676. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  677. _mm256_insert_epi8 (__m256i __X, int __D, int const __N)
  678. {
  679. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  680. __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  681. return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
  682. }
  683. #ifdef __x86_64__
  684. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  685. _mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
  686. {
  687. __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  688. __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  689. return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
  690. }
  691. #endif
  692. #else
  693. #define _mm256_insertf128_pd(X, Y, O) \
  694. ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
  695. (__v2df)(__m128d)(Y), \
  696. (int)(O)))
  697. #define _mm256_insertf128_ps(X, Y, O) \
  698. ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
  699. (__v4sf)(__m128)(Y), \
  700. (int)(O)))
  701. #define _mm256_insertf128_si256(X, Y, O) \
  702. ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
  703. (__v4si)(__m128i)(Y), \
  704. (int)(O)))
  705. #define _mm256_insert_epi32(X, D, N) \
  706. (__extension__ \
  707. ({ \
  708. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
  709. __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
  710. _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
  711. }))
  712. #define _mm256_insert_epi16(X, D, N) \
  713. (__extension__ \
  714. ({ \
  715. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
  716. __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
  717. _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
  718. }))
  719. #define _mm256_insert_epi8(X, D, N) \
  720. (__extension__ \
  721. ({ \
  722. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
  723. __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
  724. _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
  725. }))
  726. #ifdef __x86_64__
  727. #define _mm256_insert_epi64(X, D, N) \
  728. (__extension__ \
  729. ({ \
  730. __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
  731. __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
  732. _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
  733. }))
  734. #endif
  735. #endif
  736. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  737. _mm256_load_pd (double const *__P)
  738. {
  739. return *(__m256d *)__P;
  740. }
  741. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  742. _mm256_store_pd (double *__P, __m256d __A)
  743. {
  744. *(__m256d *)__P = __A;
  745. }
  746. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  747. _mm256_load_ps (float const *__P)
  748. {
  749. return *(__m256 *)__P;
  750. }
  751. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  752. _mm256_store_ps (float *__P, __m256 __A)
  753. {
  754. *(__m256 *)__P = __A;
  755. }
  756. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  757. _mm256_loadu_pd (double const *__P)
  758. {
  759. return *(__m256d_u *)__P;
  760. }
  761. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  762. _mm256_storeu_pd (double *__P, __m256d __A)
  763. {
  764. *(__m256d_u *)__P = __A;
  765. }
  766. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  767. _mm256_loadu_ps (float const *__P)
  768. {
  769. return *(__m256_u *)__P;
  770. }
  771. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  772. _mm256_storeu_ps (float *__P, __m256 __A)
  773. {
  774. *(__m256_u *)__P = __A;
  775. }
  776. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  777. _mm256_load_si256 (__m256i const *__P)
  778. {
  779. return *__P;
  780. }
  781. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  782. _mm256_store_si256 (__m256i *__P, __m256i __A)
  783. {
  784. *__P = __A;
  785. }
  786. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  787. _mm256_loadu_si256 (__m256i_u const *__P)
  788. {
  789. return *__P;
  790. }
  791. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  792. _mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
  793. {
  794. *__P = __A;
  795. }
  796. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  797. _mm_maskload_pd (double const *__P, __m128i __M)
  798. {
  799. return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
  800. (__v2di)__M);
  801. }
  802. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  803. _mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
  804. {
  805. __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
  806. }
  807. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  808. _mm256_maskload_pd (double const *__P, __m256i __M)
  809. {
  810. return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
  811. (__v4di)__M);
  812. }
  813. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  814. _mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
  815. {
  816. __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
  817. }
  818. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  819. _mm_maskload_ps (float const *__P, __m128i __M)
  820. {
  821. return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
  822. (__v4si)__M);
  823. }
  824. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  825. _mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
  826. {
  827. __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
  828. }
  829. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  830. _mm256_maskload_ps (float const *__P, __m256i __M)
  831. {
  832. return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
  833. (__v8si)__M);
  834. }
  835. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  836. _mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
  837. {
  838. __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
  839. }
  840. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  841. _mm256_movehdup_ps (__m256 __X)
  842. {
  843. return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
  844. }
  845. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  846. _mm256_moveldup_ps (__m256 __X)
  847. {
  848. return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
  849. }
  850. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  851. _mm256_movedup_pd (__m256d __X)
  852. {
  853. return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
  854. }
  855. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  856. _mm256_lddqu_si256 (__m256i const *__P)
  857. {
  858. return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
  859. }
  860. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  861. _mm256_stream_si256 (__m256i *__A, __m256i __B)
  862. {
  863. __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
  864. }
  865. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  866. _mm256_stream_pd (double *__A, __m256d __B)
  867. {
  868. __builtin_ia32_movntpd256 (__A, (__v4df)__B);
  869. }
  870. extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  871. _mm256_stream_ps (float *__P, __m256 __A)
  872. {
  873. __builtin_ia32_movntps256 (__P, (__v8sf)__A);
  874. }
  875. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  876. _mm256_rcp_ps (__m256 __A)
  877. {
  878. return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
  879. }
  880. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  881. _mm256_rsqrt_ps (__m256 __A)
  882. {
  883. return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
  884. }
  885. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  886. _mm256_sqrt_pd (__m256d __A)
  887. {
  888. return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
  889. }
  890. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  891. _mm256_sqrt_ps (__m256 __A)
  892. {
  893. return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
  894. }
  895. #ifdef __OPTIMIZE__
  896. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  897. _mm256_round_pd (__m256d __V, const int __M)
  898. {
  899. return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
  900. }
  901. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  902. _mm256_round_ps (__m256 __V, const int __M)
  903. {
  904. return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
  905. }
  906. #else
  907. #define _mm256_round_pd(V, M) \
  908. ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
  909. #define _mm256_round_ps(V, M) \
  910. ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
  911. #endif
  912. #define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL)
  913. #define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
  914. #define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL)
  915. #define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
  916. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  917. _mm256_unpackhi_pd (__m256d __A, __m256d __B)
  918. {
  919. return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
  920. }
  921. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  922. _mm256_unpacklo_pd (__m256d __A, __m256d __B)
  923. {
  924. return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
  925. }
  926. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  927. _mm256_unpackhi_ps (__m256 __A, __m256 __B)
  928. {
  929. return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
  930. }
  931. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  932. _mm256_unpacklo_ps (__m256 __A, __m256 __B)
  933. {
  934. return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
  935. }
  936. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  937. _mm_testz_pd (__m128d __M, __m128d __V)
  938. {
  939. return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
  940. }
  941. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  942. _mm_testc_pd (__m128d __M, __m128d __V)
  943. {
  944. return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
  945. }
  946. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  947. _mm_testnzc_pd (__m128d __M, __m128d __V)
  948. {
  949. return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
  950. }
  951. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  952. _mm_testz_ps (__m128 __M, __m128 __V)
  953. {
  954. return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
  955. }
  956. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  957. _mm_testc_ps (__m128 __M, __m128 __V)
  958. {
  959. return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
  960. }
  961. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  962. _mm_testnzc_ps (__m128 __M, __m128 __V)
  963. {
  964. return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
  965. }
  966. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  967. _mm256_testz_pd (__m256d __M, __m256d __V)
  968. {
  969. return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
  970. }
  971. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  972. _mm256_testc_pd (__m256d __M, __m256d __V)
  973. {
  974. return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
  975. }
  976. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  977. _mm256_testnzc_pd (__m256d __M, __m256d __V)
  978. {
  979. return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
  980. }
  981. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  982. _mm256_testz_ps (__m256 __M, __m256 __V)
  983. {
  984. return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
  985. }
  986. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  987. _mm256_testc_ps (__m256 __M, __m256 __V)
  988. {
  989. return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
  990. }
  991. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  992. _mm256_testnzc_ps (__m256 __M, __m256 __V)
  993. {
  994. return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
  995. }
  996. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  997. _mm256_testz_si256 (__m256i __M, __m256i __V)
  998. {
  999. return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
  1000. }
  1001. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1002. _mm256_testc_si256 (__m256i __M, __m256i __V)
  1003. {
  1004. return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
  1005. }
  1006. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1007. _mm256_testnzc_si256 (__m256i __M, __m256i __V)
  1008. {
  1009. return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
  1010. }
  1011. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1012. _mm256_movemask_pd (__m256d __A)
  1013. {
  1014. return __builtin_ia32_movmskpd256 ((__v4df)__A);
  1015. }
  1016. extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1017. _mm256_movemask_ps (__m256 __A)
  1018. {
  1019. return __builtin_ia32_movmskps256 ((__v8sf)__A);
  1020. }
  1021. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1022. _mm256_undefined_pd (void)
  1023. {
  1024. __m256d __Y = __Y;
  1025. return __Y;
  1026. }
  1027. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1028. _mm256_undefined_ps (void)
  1029. {
  1030. __m256 __Y = __Y;
  1031. return __Y;
  1032. }
  1033. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1034. _mm256_undefined_si256 (void)
  1035. {
  1036. __m256i __Y = __Y;
  1037. return __Y;
  1038. }
  1039. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1040. _mm256_setzero_pd (void)
  1041. {
  1042. return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
  1043. }
  1044. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1045. _mm256_setzero_ps (void)
  1046. {
  1047. return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
  1048. 0.0, 0.0, 0.0, 0.0 };
  1049. }
  1050. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1051. _mm256_setzero_si256 (void)
  1052. {
  1053. return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
  1054. }
  1055. /* Create the vector [A B C D]. */
  1056. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1057. _mm256_set_pd (double __A, double __B, double __C, double __D)
  1058. {
  1059. return __extension__ (__m256d){ __D, __C, __B, __A };
  1060. }
  1061. /* Create the vector [A B C D E F G H]. */
  1062. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1063. _mm256_set_ps (float __A, float __B, float __C, float __D,
  1064. float __E, float __F, float __G, float __H)
  1065. {
  1066. return __extension__ (__m256){ __H, __G, __F, __E,
  1067. __D, __C, __B, __A };
  1068. }
  1069. /* Create the vector [A B C D E F G H]. */
  1070. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1071. _mm256_set_epi32 (int __A, int __B, int __C, int __D,
  1072. int __E, int __F, int __G, int __H)
  1073. {
  1074. return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
  1075. __D, __C, __B, __A };
  1076. }
  1077. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1078. _mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
  1079. short __q11, short __q10, short __q09, short __q08,
  1080. short __q07, short __q06, short __q05, short __q04,
  1081. short __q03, short __q02, short __q01, short __q00)
  1082. {
  1083. return __extension__ (__m256i)(__v16hi){
  1084. __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
  1085. __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  1086. };
  1087. }
  1088. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1089. _mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
  1090. char __q27, char __q26, char __q25, char __q24,
  1091. char __q23, char __q22, char __q21, char __q20,
  1092. char __q19, char __q18, char __q17, char __q16,
  1093. char __q15, char __q14, char __q13, char __q12,
  1094. char __q11, char __q10, char __q09, char __q08,
  1095. char __q07, char __q06, char __q05, char __q04,
  1096. char __q03, char __q02, char __q01, char __q00)
  1097. {
  1098. return __extension__ (__m256i)(__v32qi){
  1099. __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
  1100. __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
  1101. __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
  1102. __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  1103. };
  1104. }
  1105. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1106. _mm256_set_epi64x (long long __A, long long __B, long long __C,
  1107. long long __D)
  1108. {
  1109. return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
  1110. }
  1111. /* Create a vector with all elements equal to A. */
  1112. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1113. _mm256_set1_pd (double __A)
  1114. {
  1115. return __extension__ (__m256d){ __A, __A, __A, __A };
  1116. }
  1117. /* Create a vector with all elements equal to A. */
  1118. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1119. _mm256_set1_ps (float __A)
  1120. {
  1121. return __extension__ (__m256){ __A, __A, __A, __A,
  1122. __A, __A, __A, __A };
  1123. }
  1124. /* Create a vector with all elements equal to A. */
  1125. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1126. _mm256_set1_epi32 (int __A)
  1127. {
  1128. return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
  1129. __A, __A, __A, __A };
  1130. }
  1131. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1132. _mm256_set1_epi16 (short __A)
  1133. {
  1134. return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
  1135. __A, __A, __A, __A, __A, __A, __A, __A);
  1136. }
  1137. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1138. _mm256_set1_epi8 (char __A)
  1139. {
  1140. return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
  1141. __A, __A, __A, __A, __A, __A, __A, __A,
  1142. __A, __A, __A, __A, __A, __A, __A, __A,
  1143. __A, __A, __A, __A, __A, __A, __A, __A);
  1144. }
  1145. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1146. _mm256_set1_epi64x (long long __A)
  1147. {
  1148. return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
  1149. }
  1150. /* Create vectors of elements in the reversed order from the
  1151. _mm256_set_XXX functions. */
  1152. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1153. _mm256_setr_pd (double __A, double __B, double __C, double __D)
  1154. {
  1155. return _mm256_set_pd (__D, __C, __B, __A);
  1156. }
  1157. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1158. _mm256_setr_ps (float __A, float __B, float __C, float __D,
  1159. float __E, float __F, float __G, float __H)
  1160. {
  1161. return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
  1162. }
  1163. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1164. _mm256_setr_epi32 (int __A, int __B, int __C, int __D,
  1165. int __E, int __F, int __G, int __H)
  1166. {
  1167. return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
  1168. }
  1169. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1170. _mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
  1171. short __q11, short __q10, short __q09, short __q08,
  1172. short __q07, short __q06, short __q05, short __q04,
  1173. short __q03, short __q02, short __q01, short __q00)
  1174. {
  1175. return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
  1176. __q04, __q05, __q06, __q07,
  1177. __q08, __q09, __q10, __q11,
  1178. __q12, __q13, __q14, __q15);
  1179. }
  1180. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1181. _mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
  1182. char __q27, char __q26, char __q25, char __q24,
  1183. char __q23, char __q22, char __q21, char __q20,
  1184. char __q19, char __q18, char __q17, char __q16,
  1185. char __q15, char __q14, char __q13, char __q12,
  1186. char __q11, char __q10, char __q09, char __q08,
  1187. char __q07, char __q06, char __q05, char __q04,
  1188. char __q03, char __q02, char __q01, char __q00)
  1189. {
  1190. return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
  1191. __q04, __q05, __q06, __q07,
  1192. __q08, __q09, __q10, __q11,
  1193. __q12, __q13, __q14, __q15,
  1194. __q16, __q17, __q18, __q19,
  1195. __q20, __q21, __q22, __q23,
  1196. __q24, __q25, __q26, __q27,
  1197. __q28, __q29, __q30, __q31);
  1198. }
  1199. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1200. _mm256_setr_epi64x (long long __A, long long __B, long long __C,
  1201. long long __D)
  1202. {
  1203. return _mm256_set_epi64x (__D, __C, __B, __A);
  1204. }
  1205. /* Casts between various SP, DP, INT vector types. Note that these do no
  1206. conversion of values, they just change the type. */
  1207. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1208. _mm256_castpd_ps (__m256d __A)
  1209. {
  1210. return (__m256) __A;
  1211. }
  1212. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1213. _mm256_castpd_si256 (__m256d __A)
  1214. {
  1215. return (__m256i) __A;
  1216. }
  1217. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1218. _mm256_castps_pd (__m256 __A)
  1219. {
  1220. return (__m256d) __A;
  1221. }
  1222. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1223. _mm256_castps_si256(__m256 __A)
  1224. {
  1225. return (__m256i) __A;
  1226. }
  1227. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1228. _mm256_castsi256_ps (__m256i __A)
  1229. {
  1230. return (__m256) __A;
  1231. }
  1232. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1233. _mm256_castsi256_pd (__m256i __A)
  1234. {
  1235. return (__m256d) __A;
  1236. }
  1237. extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1238. _mm256_castpd256_pd128 (__m256d __A)
  1239. {
  1240. return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
  1241. }
  1242. extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1243. _mm256_castps256_ps128 (__m256 __A)
  1244. {
  1245. return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
  1246. }
  1247. extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1248. _mm256_castsi256_si128 (__m256i __A)
  1249. {
  1250. return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
  1251. }
  1252. /* When cast is done from a 128 to 256-bit type, the low 128 bits of
  1253. the 256-bit result contain source parameter value and the upper 128
  1254. bits of the result are undefined. Those intrinsics shouldn't
  1255. generate any extra moves. */
  1256. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1257. _mm256_castpd128_pd256 (__m128d __A)
  1258. {
  1259. return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
  1260. }
  1261. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1262. _mm256_castps128_ps256 (__m128 __A)
  1263. {
  1264. return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
  1265. }
  1266. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1267. _mm256_castsi128_si256 (__m128i __A)
  1268. {
  1269. return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
  1270. }
  1271. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1272. _mm256_set_m128 ( __m128 __H, __m128 __L)
  1273. {
  1274. return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
  1275. }
  1276. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1277. _mm256_set_m128d (__m128d __H, __m128d __L)
  1278. {
  1279. return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
  1280. }
  1281. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1282. _mm256_set_m128i (__m128i __H, __m128i __L)
  1283. {
  1284. return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
  1285. }
  1286. extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1287. _mm256_setr_m128 (__m128 __L, __m128 __H)
  1288. {
  1289. return _mm256_set_m128 (__H, __L);
  1290. }
  1291. extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1292. _mm256_setr_m128d (__m128d __L, __m128d __H)
  1293. {
  1294. return _mm256_set_m128d (__H, __L);
  1295. }
  1296. extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1297. _mm256_setr_m128i (__m128i __L, __m128i __H)
  1298. {
  1299. return _mm256_set_m128i (__H, __L);
  1300. }
  1301. #ifdef __DISABLE_AVX__
  1302. #undef __DISABLE_AVX__
  1303. #pragma GCC pop_options
  1304. #endif /* __DISABLE_AVX__ */
  1305. #endif /* _AVXINTRIN_H_INCLUDED */