avx512bf16intrin.h 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. /* Copyright (C) 2019-2022 Free Software Foundation, Inc.
  2. This file is part of GCC.
  3. GCC is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 3, or (at your option)
  6. any later version.
  7. GCC is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. Under Section 7 of GPL version 3, you are granted additional
  12. permissions described in the GCC Runtime Library Exception, version
  13. 3.1, as published by the Free Software Foundation.
  14. You should have received a copy of the GNU General Public License and
  15. a copy of the GCC Runtime Library Exception along with this program;
  16. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  17. <http://www.gnu.org/licenses/>. */
  18. #ifndef _IMMINTRIN_H_INCLUDED
  19. #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
  20. #endif
  21. #ifndef _AVX512BF16INTRIN_H_INCLUDED
  22. #define _AVX512BF16INTRIN_H_INCLUDED
  23. #ifndef __AVX512BF16__
  24. #pragma GCC push_options
  25. #pragma GCC target("avx512bf16")
  26. #define __DISABLE_AVX512BF16__
  27. #endif /* __AVX512BF16__ */
  28. /* Internal data types for implementing the intrinsics. */
  29. typedef short __v32bh __attribute__ ((__vector_size__ (64)));
  30. /* The Intel API is flexible enough that we must allow aliasing with other
  31. vector types, and their scalar components. */
  32. typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
  33. /* Convert One BF16 Data to One Single Float Data. */
  34. extern __inline float
  35. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  36. _mm_cvtsbh_ss (__bfloat16 __A)
  37. {
  38. union{ float a; unsigned int b;} __tmp;
  39. __tmp.b = ((unsigned int)(__A)) << 16;
  40. return __tmp.a;
  41. }
  42. /* vcvtne2ps2bf16 */
  43. extern __inline __m512bh
  44. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  45. _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
  46. {
  47. return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
  48. }
  49. extern __inline __m512bh
  50. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  51. _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
  52. {
  53. return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
  54. }
  55. extern __inline __m512bh
  56. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  57. _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
  58. {
  59. return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
  60. }
  61. /* vcvtneps2bf16 */
  62. extern __inline __m256bh
  63. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  64. _mm512_cvtneps_pbh (__m512 __A)
  65. {
  66. return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A);
  67. }
  68. extern __inline __m256bh
  69. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  70. _mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C)
  71. {
  72. return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B);
  73. }
  74. extern __inline __m256bh
  75. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  76. _mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B)
  77. {
  78. return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A);
  79. }
  80. /* vdpbf16ps */
  81. extern __inline __m512
  82. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  83. _mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C)
  84. {
  85. return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C);
  86. }
  87. extern __inline __m512
  88. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  89. _mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D)
  90. {
  91. return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B);
  92. }
  93. extern __inline __m512
  94. __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  95. _mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D)
  96. {
  97. return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);
  98. }
  99. extern __inline __m512
  100. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  101. _mm512_cvtpbh_ps (__m256bh __A)
  102. {
  103. return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 (
  104. (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16));
  105. }
  106. extern __inline __m512
  107. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  108. _mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A)
  109. {
  110. return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 (
  111. (__m512i)_mm512_maskz_cvtepi16_epi32 (
  112. (__mmask16)__U, (__m256i)__A), 16));
  113. }
  114. extern __inline __m512
  115. __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  116. _mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A)
  117. {
  118. return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 (
  119. (__m512i)__S, (__mmask16)__U,
  120. (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)));
  121. }
  122. #ifdef __DISABLE_AVX512BF16__
  123. #undef __DISABLE_AVX512BF16__
  124. #pragma GCC pop_options
  125. #endif /* __DISABLE_AVX512BF16__ */
  126. #endif /* _AVX512BF16INTRIN_H_INCLUDED */