avx512bf16vlintrin.h 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. /* Copyright (C) 2019-2022 Free Software Foundation, Inc.
  2. This file is part of GCC.
  3. GCC is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 3, or (at your option)
  6. any later version.
  7. GCC is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. Under Section 7 of GPL version 3, you are granted additional
  12. permissions described in the GCC Runtime Library Exception, version
  13. 3.1, as published by the Free Software Foundation.
  14. You should have received a copy of the GNU General Public License and
  15. a copy of the GCC Runtime Library Exception along with this program;
  16. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  17. <http://www.gnu.org/licenses/>. */
  18. #ifndef _IMMINTRIN_H_INCLUDED
  19. #error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
  20. #endif
  21. #ifndef _AVX512BF16VLINTRIN_H_INCLUDED
  22. #define _AVX512BF16VLINTRIN_H_INCLUDED
  23. #if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
  24. #pragma GCC push_options
  25. #pragma GCC target("avx512bf16,avx512vl")
  26. #define __DISABLE_AVX512BF16VL__
#endif /* !__AVX512VL__ || !__AVX512BF16__ */
  28. /* Internal data types for implementing the intrinsics. */
  29. typedef short __v16bh __attribute__ ((__vector_size__ (32)));
  30. typedef short __v8bh __attribute__ ((__vector_size__ (16)));
  31. /* The Intel API is flexible enough that we must allow aliasing with other
  32. vector types, and their scalar components. */
  33. typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
  34. typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
  35. typedef unsigned short __bfloat16;
  36. /* vcvtne2ps2bf16 */
  37. extern __inline __m256bh
  38. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  39. _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
  40. {
  41. return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
  42. }
  43. extern __inline __m256bh
  44. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  45. _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
  46. {
  47. return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
  48. }
  49. extern __inline __m256bh
  50. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  51. _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
  52. {
  53. return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
  54. }
  55. extern __inline __m128bh
  56. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  57. _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
  58. {
  59. return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
  60. }
  61. extern __inline __m128bh
  62. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  63. _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
  64. {
  65. return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
  66. }
  67. extern __inline __m128bh
  68. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  69. _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
  70. {
  71. return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
  72. }
  73. /* vcvtneps2bf16 */
  74. extern __inline __m128bh
  75. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  76. _mm256_cvtneps_pbh (__m256 __A)
  77. {
  78. return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
  79. }
  80. extern __inline __m128bh
  81. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  82. _mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
  83. {
  84. return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
  85. }
  86. extern __inline __m128bh
  87. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  88. _mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
  89. {
  90. return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
  91. }
  92. extern __inline __m128bh
  93. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  94. _mm_cvtneps_pbh (__m128 __A)
  95. {
  96. return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
  97. }
  98. extern __inline __m128bh
  99. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  100. _mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
  101. {
  102. return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
  103. }
  104. extern __inline __m128bh
  105. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  106. _mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
  107. {
  108. return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
  109. }
  110. /* vdpbf16ps */
  111. extern __inline __m256
  112. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  113. _mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
  114. {
  115. return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
  116. }
  117. extern __inline __m256
  118. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  119. _mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
  120. {
  121. return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
  122. }
  123. extern __inline __m256
  124. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  125. _mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
  126. {
  127. return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
  128. }
  129. extern __inline __m128
  130. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  131. _mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
  132. {
  133. return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
  134. }
  135. extern __inline __m128
  136. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  137. _mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
  138. {
  139. return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
  140. }
  141. extern __inline __m128
  142. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  143. _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
  144. {
  145. return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
  146. }
  147. extern __inline __bfloat16
  148. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  149. _mm_cvtness_sbh (float __A)
  150. {
  151. __v4sf __V = {__A, 0, 0, 0};
  152. __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
  153. (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
  154. return __R[0];
  155. }
  156. extern __inline __m128
  157. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  158. _mm_cvtpbh_ps (__m128bh __A)
  159. {
  160. return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
  161. (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
  162. }
  163. extern __inline __m256
  164. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  165. _mm256_cvtpbh_ps (__m128bh __A)
  166. {
  167. return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
  168. (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
  169. }
  170. extern __inline __m128
  171. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  172. _mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
  173. {
  174. return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
  175. (__m128i)_mm_maskz_cvtepi16_epi32 (
  176. (__mmask8)__U, (__m128i)__A), 16));
  177. }
  178. extern __inline __m256
  179. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  180. _mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
  181. {
  182. return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
  183. (__m256i)_mm256_maskz_cvtepi16_epi32 (
  184. (__mmask8)__U, (__m128i)__A), 16));
  185. }
  186. extern __inline __m128
  187. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  188. _mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
  189. {
  190. return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
  191. (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
  192. (__m128i)__A), 16));
  193. }
  194. extern __inline __m256
  195. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  196. _mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
  197. {
  198. return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
  199. (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
  200. (__m128i)__A), 16));
  201. }
  202. #ifdef __DISABLE_AVX512BF16VL__
  203. #undef __DISABLE_AVX512BF16VL__
  204. #pragma GCC pop_options
  205. #endif /* __DISABLE_AVX512BF16VL__ */
  206. #endif /* _AVX512BF16VLINTRIN_H_INCLUDED */