dvec.h

/**
 * This file has no copyright assigned and is placed in the Public Domain.
 * This file is part of the mingw-w64 runtime package.
 * No warranty is given; refer to the file DISCLAIMER.PD within this package.
 */
#ifndef _DVEC_H_INCLUDED
#define _DVEC_H_INCLUDED
#ifndef RC_INVOKED
#if !defined __cplusplus
#error This file is only supported in C++ compilations!
#endif
#include <intrin.h>
#include <assert.h>
#include <fvec.h>
#include <crtdefs.h>
#pragma pack(push,_CRT_PACKING)
#if defined(_ENABLE_VEC_DEBUG)
#include <iostream>
#endif
#ifdef __SSE__
#pragma pack(push,16)
#define EXPLICIT explicit
class I8vec16;
class Is8vec16;
class Iu8vec16;
class I16vec8;
class Is16vec8;
class Iu16vec8;
class I32vec4;
class Is32vec4;
class Iu32vec4;
class I64vec2;
class I128vec1;
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))
#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))
#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))
#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
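/* The _MM_* macros above reinterpret the storage of an __m128i so a single
   element can be read or written directly; index 0 is the least-significant
   element. get_mask128() below returns an all-ones 128-bit constant that the
   cmpneq operations use to invert a comparison result. */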
__MINGW_EXTENSION inline const __m128i get_mask128()
{
  static const __m128i mask128 = _mm_set1_epi64(M64((__int64)0xffffffffffffffffll));
  return mask128;
}
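/* M128 is the common base class: it wraps an __m128i, converts implicitly to
   and from it, and provides the bitwise &=, |= and ^= operators shared by
   every integer vector class in this header. */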
class M128
{
protected:
  __m128i vec;
public:
  M128() { }
  M128(__m128i mm) { vec = mm; }
  operator __m128i() const { return vec; }
  M128& operator&=(const M128 &a) { return *this = (M128) _mm_and_si128(vec,a); }
  M128& operator|=(const M128 &a) { return *this = (M128) _mm_or_si128(vec,a); }
  M128& operator^=(const M128 &a) { return *this = (M128) _mm_xor_si128(vec,a); }
};
inline M128 operator&(const M128 &a,const M128 &b) { return _mm_and_si128(a,b); }
inline M128 operator|(const M128 &a,const M128 &b) { return _mm_or_si128(a,b); }
inline M128 operator^(const M128 &a,const M128 &b) { return _mm_xor_si128(a,b); }
inline M128 andnot(const M128 &a,const M128 &b) { return _mm_andnot_si128(a,b); }
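/* I128vec1: the vector viewed as one 128-bit element; only assignment and the
   bitwise operations are meaningful at this width. */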
class I128vec1 : public M128
{
public:
  I128vec1() { }
  I128vec1(__m128i mm) : M128(mm) { }
  I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
  I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
  I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
  I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
};
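/* I64vec2: two 64-bit integers (no signed or unsigned variant is provided at
   this width); supports add, subtract, logical shifts and element access. */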
class I64vec2 : public M128
{
public:
  I64vec2() { }
  I64vec2(__m128i mm) : M128(mm) { }
  __MINGW_EXTENSION I64vec2(__m64 q1,__m64 q0)
  {
    _MM_2QW(0,vec) = *(__int64*)&q0;
    _MM_2QW(1,vec) = *(__int64*)&q1;
  }
  I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
  I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
  I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
  I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
  I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
  I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
  I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
  I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
  I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
  I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
  I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
  I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
  I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
  I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
  __MINGW_EXTENSION const __int64& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }
  __MINGW_EXTENSION __int64& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }
};
inline I64vec2 unpack_low(const I64vec2 &a,const I64vec2 &b) { return _mm_unpacklo_epi64(a,b); }
inline I64vec2 unpack_high(const I64vec2 &a,const I64vec2 &b) { return _mm_unpackhi_epi64(a,b); }
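/* I32vec4: four 32-bit integers whose signedness is left to the derived
   Is32vec4 and Iu32vec4 classes; the base provides the bitwise operators,
   add, subtract and left shifts. */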
class I32vec4 : public M128
{
public:
  I32vec4() { }
  I32vec4(__m128i mm) : M128(mm) { }
  I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
  I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
  I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
  I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
  I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
  I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
  I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
  I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
  I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
};
inline I32vec4 cmpeq(const I32vec4 &a,const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline I32vec4 cmpneq(const I32vec4 &a,const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
inline I32vec4 unpack_low(const I32vec4 &a,const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline I32vec4 unpack_high(const I32vec4 &a,const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
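/* Is32vec4: four signed 32-bit integers; adds arithmetic (sign-preserving)
   right shifts, signed comparisons via the free cmpgt/cmplt functions, and
   element access. */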
class Is32vec4 : public I32vec4
{
public:
  Is32vec4() { }
  Is32vec4(__m128i mm) : I32vec4(mm) { }
  Is32vec4(int i3,int i2,int i1,int i0)
  {
    _MM_4DW(0,vec) = i0;
    _MM_4DW(1,vec) = i1;
    _MM_4DW(2,vec) = i2;
    _MM_4DW(3,vec) = i3;
  }
  Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
  Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
  Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
  Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
  Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
  Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
  Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
  Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
  Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
  Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
  Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
  Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
  {
    os << "[3]:" << _MM_4DW(3,a)
       << " [2]:" << _MM_4DW(2,a)
       << " [1]:" << _MM_4DW(1,a)
       << " [0]:" << _MM_4DW(0,a);
    return os;
  }
#endif
  const int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }
  int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }
};
inline Is32vec4 cmpeq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline Is32vec4 cmpneq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
inline Is32vec4 cmpgt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
inline Is32vec4 cmplt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }
inline Is32vec4 unpack_low(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline Is32vec4 unpack_high(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
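/* Iu32vec4: four unsigned 32-bit integers; right shifts are logical, and
   operator* is the widening _mm_mul_epu32 multiply of the even elements,
   producing an I64vec2. */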
class Iu32vec4 : public I32vec4
{
public:
  Iu32vec4() { }
  Iu32vec4(__m128i mm) : I32vec4(mm) { }
  Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
  {
    _MM_4UDW(0,vec) = ui0;
    _MM_4UDW(1,vec) = ui1;
    _MM_4UDW(2,vec) = ui2;
    _MM_4UDW(3,vec) = ui3;
  }
  Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
  Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
  Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
  Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
  Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
  Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
  Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
  Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
  Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
  Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
  Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
  Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
  {
    os << "[3]:" << _MM_4UDW(3,a)
       << " [2]:" << _MM_4UDW(2,a)
       << " [1]:" << _MM_4UDW(1,a)
       << " [0]:" << _MM_4UDW(0,a);
    return os;
  }
#endif
  const unsigned int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }
  unsigned int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }
};
inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
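/* I16vec8: eight 16-bit integers; the base provides add, subtract, low-half
   multiply and left shifts. */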
class I16vec8 : public M128
{
public:
  I16vec8() { }
  I16vec8(__m128i mm) : M128(mm) { }
  I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
  I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
  I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
  I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
  I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
  I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
  I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
  I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
  I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
};
inline I16vec8 operator*(const I16vec8 &a,const I16vec8 &b) { return _mm_mullo_epi16(a,b); }
inline I16vec8 cmpeq(const I16vec8 &a,const I16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline I16vec8 cmpneq(const I16vec8 &a,const I16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
inline I16vec8 unpack_low(const I16vec8 &a,const I16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline I16vec8 unpack_high(const I16vec8 &a,const I16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
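/* Is16vec8: eight signed 16-bit integers; provides arithmetic right shifts,
   comparisons, saturating add/sub, high-half and multiply-add products
   (mul_high, mul_add), and min/max. */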
class Is16vec8 : public I16vec8
{
public:
  Is16vec8() { }
  Is16vec8(__m128i mm) : I16vec8(mm) { }
  Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
  {
    _MM_8W(0,vec) = s0;
    _MM_8W(1,vec) = s1;
    _MM_8W(2,vec) = s2;
    _MM_8W(3,vec) = s3;
    _MM_8W(4,vec) = s4;
    _MM_8W(5,vec) = s5;
    _MM_8W(6,vec) = s6;
    _MM_8W(7,vec) = s7;
  }
  Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
  Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
  Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
  Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
  Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
  Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
  Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }
  Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
  Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
  Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
  Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
  Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
  Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
  {
    os << "[7]:" << _MM_8W(7,a)
       << " [6]:" << _MM_8W(6,a)
       << " [5]:" << _MM_8W(5,a)
       << " [4]:" << _MM_8W(4,a)
       << " [3]:" << _MM_8W(3,a)
       << " [2]:" << _MM_8W(2,a)
       << " [1]:" << _MM_8W(1,a)
       << " [0]:" << _MM_8W(0,a);
    return os;
  }
#endif
  const signed short& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }
  signed short& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }
};
inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b); }
inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
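/* Iu16vec8: eight unsigned 16-bit integers; right shifts are logical, and the
   free functions add saturating add/sub, averaging and an unsigned high-half
   multiply. */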
class Iu16vec8 : public I16vec8
{
public:
  Iu16vec8() { }
  Iu16vec8(__m128i mm) : I16vec8(mm) { }
  Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
  {
    _MM_8UW(0,vec) = s0;
    _MM_8UW(1,vec) = s1;
    _MM_8UW(2,vec) = s2;
    _MM_8UW(3,vec) = s3;
    _MM_8UW(4,vec) = s4;
    _MM_8UW(5,vec) = s5;
    _MM_8UW(6,vec) = s6;
    _MM_8UW(7,vec) = s7;
  }
  Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
  Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
  Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
  Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
  Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
  Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
  Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
  Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
  Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
  Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
  Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
  Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
  Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
  {
    os << "[7]:" << (unsigned short)(_MM_8UW(7,a))
       << " [6]:" << (unsigned short)(_MM_8UW(6,a))
       << " [5]:" << (unsigned short)(_MM_8UW(5,a))
       << " [4]:" << (unsigned short)(_MM_8UW(4,a))
       << " [3]:" << (unsigned short)(_MM_8UW(3,a))
       << " [2]:" << (unsigned short)(_MM_8UW(2,a))
       << " [1]:" << (unsigned short)(_MM_8UW(1,a))
       << " [0]:" << (unsigned short)(_MM_8UW(0,a));
    return os;
  }
#endif
  const unsigned short& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8UW(i,vec);
  }
  unsigned short& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8UW(i,vec);
  }
};
inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
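/* I8vec16: sixteen 8-bit integers; only add, subtract and the bitwise
   operations are provided at this width. */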
class I8vec16 : public M128
{
public:
  I8vec16() { }
  I8vec16(__m128i mm) : M128(mm) { }
  I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
  I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
  I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
  I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
  I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
  I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
};
inline I8vec16 cmpeq(const I8vec16 &a,const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline I8vec16 cmpneq(const I8vec16 &a,const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
inline I8vec16 unpack_low(const I8vec16 &a,const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline I8vec16 unpack_high(const I8vec16 &a,const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
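/* Is8vec16: sixteen signed 8-bit integers; the free functions add signed
   comparisons and saturating add/sub. */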
class Is8vec16 : public I8vec16
{
public:
  Is8vec16() { }
  Is8vec16(__m128i mm) : I8vec16(mm) { }
  Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
  Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
  Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
  Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
  Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
  Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
  {
    os << "[15]:" << short(_MM_16B(15,a))
       << " [14]:" << short(_MM_16B(14,a))
       << " [13]:" << short(_MM_16B(13,a))
       << " [12]:" << short(_MM_16B(12,a))
       << " [11]:" << short(_MM_16B(11,a))
       << " [10]:" << short(_MM_16B(10,a))
       << " [9]:" << short(_MM_16B(9,a))
       << " [8]:" << short(_MM_16B(8,a))
       << " [7]:" << short(_MM_16B(7,a))
       << " [6]:" << short(_MM_16B(6,a))
       << " [5]:" << short(_MM_16B(5,a))
       << " [4]:" << short(_MM_16B(4,a))
       << " [3]:" << short(_MM_16B(3,a))
       << " [2]:" << short(_MM_16B(2,a))
       << " [1]:" << short(_MM_16B(1,a))
       << " [0]:" << short(_MM_16B(0,a));
    return os;
  }
#endif
  const signed char& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }
  signed char& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }
};
inline Is8vec16 cmpeq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline Is8vec16 cmpneq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
inline Is8vec16 cmpgt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
inline Is8vec16 cmplt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }
inline Is8vec16 unpack_low(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline Is8vec16 unpack_high(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
inline Is8vec16 sat_add(const Is8vec16 &a,const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
inline Is8vec16 sat_sub(const Is8vec16 &a,const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
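/* Iu8vec16: sixteen unsigned 8-bit integers; the free functions add saturating
   add/sub, sum of absolute differences (sum_abs), averaging and min/max. */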
class Iu8vec16 : public I8vec16
{
public:
  Iu8vec16() { }
  Iu8vec16(__m128i mm) : I8vec16(mm) { }
  Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
  Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
  Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
  Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
  Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
  Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
  {
    os << "[15]:" << (unsigned short)(_MM_16UB(15,a))
       << " [14]:" << (unsigned short)(_MM_16UB(14,a))
       << " [13]:" << (unsigned short)(_MM_16UB(13,a))
       << " [12]:" << (unsigned short)(_MM_16UB(12,a))
       << " [11]:" << (unsigned short)(_MM_16UB(11,a))
       << " [10]:" << (unsigned short)(_MM_16UB(10,a))
       << " [9]:" << (unsigned short)(_MM_16UB(9,a))
       << " [8]:" << (unsigned short)(_MM_16UB(8,a))
       << " [7]:" << (unsigned short)(_MM_16UB(7,a))
       << " [6]:" << (unsigned short)(_MM_16UB(6,a))
       << " [5]:" << (unsigned short)(_MM_16UB(5,a))
       << " [4]:" << (unsigned short)(_MM_16UB(4,a))
       << " [3]:" << (unsigned short)(_MM_16UB(3,a))
       << " [2]:" << (unsigned short)(_MM_16UB(2,a))
       << " [1]:" << (unsigned short)(_MM_16UB(1,a))
       << " [0]:" << (unsigned short)(_MM_16UB(0,a));
    return os;
  }
#endif
  const unsigned char& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16UB(i,vec);
  }
  unsigned char& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16UB(i,vec);
  }
};
inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
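/* Saturating packs: the two operands are combined and narrowed to the next
   smaller element size, clamping values that do not fit. */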
inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b); }
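/* IVEC128_LOGICALS instantiates the binary &, |, ^ and andnot operators for
   every integer vector class declared above. */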
#define IVEC128_LOGICALS(vect,element) \
  inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } \
  inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } \
  inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } \
  inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }
IVEC128_LOGICALS(8,16)
IVEC128_LOGICALS(u8,16)
IVEC128_LOGICALS(s8,16)
IVEC128_LOGICALS(16,8)
IVEC128_LOGICALS(u16,8)
IVEC128_LOGICALS(s16,8)
IVEC128_LOGICALS(32,4)
IVEC128_LOGICALS(u32,4)
IVEC128_LOGICALS(s32,4)
IVEC128_LOGICALS(64,2)
IVEC128_LOGICALS(128,1)
#undef IVEC128_LOGICALS
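/* IVEC128_ADD_SUB instantiates binary + and - for the 8-, 16-, 32- and 64-bit
   element classes, using the matching add and sub intrinsics for that width. */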
#define IVEC128_ADD_SUB(vect,element,opsize) \
  inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } \
  inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }
IVEC128_ADD_SUB(8,16,epi8)
IVEC128_ADD_SUB(u8,16,epi8)
IVEC128_ADD_SUB(s8,16,epi8)
IVEC128_ADD_SUB(16,8,epi16)
IVEC128_ADD_SUB(u16,8,epi16)
IVEC128_ADD_SUB(s16,8,epi16)
IVEC128_ADD_SUB(32,4,epi32)
IVEC128_ADD_SUB(u32,4,epi32)
IVEC128_ADD_SUB(s32,4,epi32)
IVEC128_ADD_SUB(64,2,epi64)
#undef IVEC128_ADD_SUB
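/* IVEC128_SELECT generates the select_eq, select_neq, select_gt and select_lt
   helpers: each compares a and b element-wise, then picks the corresponding
   element of c where the comparison is true and of d where it is false. */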
#define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) \
  inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { \
    I##vect12##vec##element mask = cmp##selop(a,b); \
    return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); \
  }
IVEC128_SELECT(8,s8,16,eq,c,d)
IVEC128_SELECT(8,u8,16,eq,c,d)
IVEC128_SELECT(8,8,16,eq,c,d)
IVEC128_SELECT(8,s8,16,neq,c,d)
IVEC128_SELECT(8,u8,16,neq,c,d)
IVEC128_SELECT(8,8,16,neq,c,d)
IVEC128_SELECT(16,s16,8,eq,c,d)
IVEC128_SELECT(16,u16,8,eq,c,d)
IVEC128_SELECT(16,16,8,eq,c,d)
IVEC128_SELECT(16,s16,8,neq,c,d)
IVEC128_SELECT(16,u16,8,neq,c,d)
IVEC128_SELECT(16,16,8,neq,c,d)
IVEC128_SELECT(32,s32,4,eq,c,d)
IVEC128_SELECT(32,u32,4,eq,c,d)
IVEC128_SELECT(32,32,4,eq,c,d)
IVEC128_SELECT(32,s32,4,neq,c,d)
IVEC128_SELECT(32,u32,4,neq,c,d)
IVEC128_SELECT(32,32,4,neq,c,d)
IVEC128_SELECT(s8,s8,16,gt,c,d)
IVEC128_SELECT(s8,u8,16,gt,c,d)
IVEC128_SELECT(s8,8,16,gt,c,d)
IVEC128_SELECT(s8,s8,16,lt,c,d)
IVEC128_SELECT(s8,u8,16,lt,c,d)
IVEC128_SELECT(s8,8,16,lt,c,d)
IVEC128_SELECT(s16,s16,8,gt,c,d)
IVEC128_SELECT(s16,u16,8,gt,c,d)
IVEC128_SELECT(s16,16,8,gt,c,d)
IVEC128_SELECT(s16,s16,8,lt,c,d)
IVEC128_SELECT(s16,u16,8,lt,c,d)
IVEC128_SELECT(s16,16,8,lt,c,d)
#undef IVEC128_SELECT
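/* F64vec2: two double-precision values in an __m128d, with arithmetic,
   comparisons (cmp*), scalar ordered/unordered comparisons (comi*, ucomi*),
   min/max, sqrt and a horizontal add. Illustrative use (example only, not
   part of the original header):
     F64vec2 x(3.0, 4.0), y(1.0, 2.0);
     F64vec2 sum = x + y;            // element-wise sum
     double s = add_horizontal(sum); // 3.0+4.0+1.0+2.0 == 10.0
*/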
class F64vec2
{
protected:
  __m128d vec;
public:
  F64vec2() {}
  F64vec2(__m128d m) { vec = m; }
  F64vec2(double d1,double d0) { vec = _mm_set_pd(d1,d0); }
  EXPLICIT F64vec2(double d) { vec = _mm_set1_pd(d); }
  operator __m128d() const { return vec; }
  friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
  friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
  friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }
  friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
  friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
  friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
  friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }
  F64vec2& operator +=(F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
  F64vec2& operator -=(F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
  F64vec2& operator *=(F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
  F64vec2& operator /=(F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
  F64vec2& operator &=(F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
  F64vec2& operator |=(F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
  F64vec2& operator ^=(F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }
  friend double add_horizontal(F64vec2 &a)
  {
    F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
    return ftemp[0];
  }
  friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }
  friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }
#define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
  F64vec2_COMP(eq)
  F64vec2_COMP(lt)
  F64vec2_COMP(le)
  F64vec2_COMP(gt)
  F64vec2_COMP(ge)
  F64vec2_COMP(ngt)
  F64vec2_COMP(nge)
  F64vec2_COMP(neq)
  F64vec2_COMP(nlt)
  F64vec2_COMP(nle)
#undef F64vec2_COMP
  friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
  friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }
#define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
  F64vec2_COMI(eq)
  F64vec2_COMI(lt)
  F64vec2_COMI(le)
  F64vec2_COMI(gt)
  F64vec2_COMI(ge)
  F64vec2_COMI(neq)
#undef F64vec2_COMI
#define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
  F64vec2_UCOMI(eq)
  F64vec2_UCOMI(lt)
  F64vec2_UCOMI(le)
  F64vec2_UCOMI(gt)
  F64vec2_UCOMI(ge)
  F64vec2_UCOMI(neq)
#undef F64vec2_UCOMI
#if defined(_ENABLE_VEC_DEBUG)
  friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
    double *dp = (double*)&a;
    os << " [1]:" << *(dp+1)
       << " [0]:" << *dp;
    return os;
  }
#endif
  const double &operator[](int i) const {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }
  double &operator[](int i) {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }
};
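/* Element unpacks, sign-bit extraction (move_mask), unaligned load/store and
   a non-temporal store for F64vec2, followed by cmp-based select_* helpers. */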
inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
#define F64vec2_SELECT(op) \
  inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { \
    F64vec2 mask = _mm_cmp##op##_pd(a,b); \
    return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); \
  }
F64vec2_SELECT(eq)
F64vec2_SELECT(lt)
F64vec2_SELECT(le)
F64vec2_SELECT(gt)
F64vec2_SELECT(ge)
F64vec2_SELECT(neq)
F64vec2_SELECT(nlt)
F64vec2_SELECT(nle)
#undef F64vec2_SELECT
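/* Conversions between F64vec2, F32vec4 (from <fvec.h>) and int: F64vec2ToInt
   truncates element 0, F32vec4ToF64vec2 widens the two low floats, and
   IntToF64vec2 replaces the low element of a. */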
inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
#pragma pack(pop)
#endif /* ifdef __SSE__ */
#pragma pack(pop)
#endif /* RC_INVOKED */
#endif /* _DVEC_H_INCLUDED */