WOQ: enable lowp-mode=INT8 for NF4 weight (#3395)

Xia-Weiwen · web-flow · commit 1be1dc86a1bb · 2024-12-17T19:09:23.000-08:00
* WOQ: enable lowp-mode=INT8 for NF4 weight

* Remove -mavx512vbmi for avx512

* Remove -mavx512vbmi for avx512_vnni and avx512_bf16
diff --git a/cmake/Modules/FindAVX.cmake b/cmake/Modules/FindAVX.cmake
@@ -186,14 +186,14 @@ CHECK_SSE(CXX "AVX512_BF16" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mavx5
 
 # gcc start to support amx from version 11.2
 # https://gcc.gnu.org/onlinedocs/gcc-11.2.0/gcc/x86-Options.html#x86-Options
-CHECK_SSE(C "AMX" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mavx512bf16 -mfma\
+CHECK_SSE(C "AMX" " ;-mavx512f -mavx512dq -mavx512vbmi -mavx512vl -mavx512bw -mavx512bf16 -mfma\
  -mamx-tile -mamx-int8 -mamx-bf16;/arch:AVX512")
-CHECK_SSE(CXX "AMX" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mavx512bf16 -mfma\
+CHECK_SSE(CXX "AMX" " ;-mavx512f -mavx512dq -mavx512vbmi -mavx512vl -mavx512bw -mavx512bf16 -mfma\
  -mamx-tile -mamx-int8 -mamx-bf16;/arch:AVX512")
 
 # gcc starts to support avx512fp16 from version 12.1
 # https://gcc.gnu.org/onlinedocs/gcc-12.1.0/gcc/x86-Options.html#x86-Options
-CHECK_SSE(C "AVX512_FP16" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mavx512bf16 -mfma\
+CHECK_SSE(C "AVX512_FP16" " ;-mavx512f -mavx512dq -mavx512vbmi -mavx512vl -mavx512bw -mavx512bf16 -mfma\
  -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16;/arch:AVX512")
-CHECK_SSE(CXX "AVX512_FP16" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mavx512bf16 -mfma\
+CHECK_SSE(CXX "AVX512_FP16" " ;-mavx512f -mavx512dq -mavx512vbmi -mavx512vl -mavx512bw -mavx512bf16 -mfma\
  -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16;/arch:AVX512")
diff --git a/cmake/cpu/IsaCodegen.cmake b/cmake/cpu/IsaCodegen.cmake
@@ -44,7 +44,7 @@ if(CXX_AVX512_FP16_FOUND)
   else(MSVC)
     list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -D__AVX512F__ ${AVX512_OPTIMIZE_FLAGS} -DCPU_CAPABILITY_AVX512 \
     -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -DCPU_CAPABILITY_AMX \
-    -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni \
+    -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni \
     -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16")
   endif(MSVC)
 else(CXX_AVX512_FP16_FOUND)
@@ -60,7 +60,7 @@ if(CXX_AMX_FOUND)
     list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") # TODO: CHECK HERE
   else(MSVC)
     list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -D__AVX512F__ ${AVX512_OPTIMIZE_FLAGS} -DCPU_CAPABILITY_AVX512 \
-    -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma \
+    -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni -mavx512bf16 -mfma \
     -mamx-tile -mamx-int8 -mamx-bf16")
   endif(MSVC)
 else(CXX_AMX_FOUND)
diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp
@@ -363,7 +363,7 @@ static size_t get_block_k(
     int64_t lowp_mode,
     int64_t group_size,
     int64_t K) {
-  size_t default_block_k = lowp_mode == 3 ? 128 : 64;
+  size_t default_block_k = lowp_mode == LOWP_MODE_INT8 ? 128 : 64;
   size_t block_k = group_size > 0
       ? std::min((size_t)group_size, default_block_k)
       : default_block_k;
@@ -389,7 +389,7 @@ at::Tensor woq_linear_pack_weight(
     size_t block_n = WOQ_N_BLOCK_SIZE;
     size_t block_k = get_block_k(weight_dtype, lowp_mode, group_size, K);
     if (weight_dtype == WOQ_DTYPE_INT4 || weight_dtype == WOQ_DTYPE_NF4) {
-      if (block_k % 4 && lowp_mode == 3) {
+      if (block_k % 4 && lowp_mode == LOWP_MODE_INT8) {
         // This case is not supported by kernel
         return weight;
       }
@@ -428,7 +428,7 @@ at::Tensor woq_linear_compute_compensation(
   TORCH_CHECK(weight.dim() == 2);
   auto N = weight.size(0), K = weight.size(1);
   if (N % WOQ_N_BLOCK_SIZE == 0 && weight_dtype == WOQ_DTYPE_INT8 &&
-      lowp_mode == 3) {
+      lowp_mode == LOWP_MODE_INT8) {
     size_t block_k = get_block_k(weight_dtype, lowp_mode, group_size, K);
     int64_t Nc = N / WOQ_N_BLOCK_SIZE, Kc = K / block_k;
     auto weight_reshaped = weight.reshape({Nc, WOQ_N_BLOCK_SIZE, Kc, block_k});
@@ -445,7 +445,7 @@ at::Tensor woq_linear_unpack_weight(
     const at::Tensor& weight,
     int64_t weight_dtype,
     int64_t lowp_mode) {
-  if (weight_dtype == WOQ_DTYPE_INT8 && lowp_mode == 3) {
+  if (weight_dtype == WOQ_DTYPE_INT8 && lowp_mode == LOWP_MODE_INT8) {
     // Unpack weight for INT8 GEMM.
     // weight is packed in 5d (Nc, Kc, block_k / 4, block_n, 4)
     // but viewd as 4d (Nc, Kc, block_k, block_n)
@@ -553,7 +553,8 @@ at::Tensor woq_linear_forward_v2(
       WOQ_DTYPE_MAP.find(weight_dtype) != WOQ_DTYPE_MAP.end(),
       "Unsupported weight dtype: ",
       weight_dtype);
-  if (WOQ_DTYPE_MAP.at(weight_dtype) == WOQ_DTYPE_INT8 && lowp_mode == 3) {
+  if (WOQ_DTYPE_MAP.at(weight_dtype) == WOQ_DTYPE_INT8 &&
+      lowp_mode == LOWP_MODE_INT8) {
     TORCH_CHECK(compensation.has_value() && compensation.value().defined());
   }
   static const at::Tensor empty_tensor = at::Tensor();
diff --git a/csrc/cpu/aten/kernels/WoqInt8GemmAPerKBlockKrnl.cpp b/csrc/cpu/aten/kernels/WoqInt8GemmAPerKBlockKrnl.cpp
@@ -98,9 +98,6 @@ at::Tensor woq_gemm_int8(
               auto quant_w_mode_ = std::get<1>(tuple);
               using act_type =
                   typename c10::impl::ScalarTypeToCPPType<act_dtype>::type;
-              TLA_ASSERT(
-                  qw_type == WOQ_DTYPE_INT4 || qw_type == WOQ_DTYPE_INT8,
-                  "LOWP_MODE_INT8 only support qw_type = INT4 or INT8");
               auto block_k = w_sizes[2];
               if (quant_block_k <= 0)
                 quant_block_k = block_k;
diff --git a/csrc/cpu/aten/kernels/WoqInt8GemmAPerMKBlockKrnl.cpp b/csrc/cpu/aten/kernels/WoqInt8GemmAPerMKBlockKrnl.cpp
@@ -98,9 +98,6 @@ at::Tensor woq_gemm_int8(
               auto quant_w_mode_ = std::get<1>(tuple);
               using act_type =
                   typename c10::impl::ScalarTypeToCPPType<act_dtype>::type;
-              TLA_ASSERT(
-                  qw_type == WOQ_DTYPE_INT4 || qw_type == WOQ_DTYPE_INT8,
-                  "LOWP_MODE_INT8 only support qw_type = INT4 or INT8");
               auto block_k = w_sizes[2];
               if (quant_block_k <= 0)
                 quant_block_k = block_k;
diff --git a/csrc/cpu/aten/kernels/WoqInt8GemmAPerMKrnl.cpp b/csrc/cpu/aten/kernels/WoqInt8GemmAPerMKrnl.cpp
@@ -98,9 +98,6 @@ at::Tensor woq_gemm_int8(
               auto quant_w_mode_ = std::get<1>(tuple);
               using act_type =
                   typename c10::impl::ScalarTypeToCPPType<act_dtype>::type;
-              TLA_ASSERT(
-                  qw_type == WOQ_DTYPE_INT4 || qw_type == WOQ_DTYPE_INT8,
-                  "LOWP_MODE_INT8 only support qw_type = INT4 or INT8");
               auto block_k = w_sizes[2];
               if (quant_block_k <= 0)
                 quant_block_k = block_k;
diff --git a/csrc/cpu/aten/kernels/WoqInt8GemmAPerTensorKrnl.cpp b/csrc/cpu/aten/kernels/WoqInt8GemmAPerTensorKrnl.cpp
@@ -98,9 +98,6 @@ at::Tensor woq_gemm_int8(
               auto quant_w_mode_ = std::get<1>(tuple);
               using act_type =
                   typename c10::impl::ScalarTypeToCPPType<act_dtype>::type;
-              TLA_ASSERT(
-                  qw_type == WOQ_DTYPE_INT4 || qw_type == WOQ_DTYPE_INT8,
-                  "LOWP_MODE_INT8 only support qw_type = INT4 or INT8");
               float scale_a;
               int32_t zp_a;
               bool is_sym_quant = !is_asymmetric_quant_a(quant_a_mode);
diff --git a/csrc/cpu/aten/utils/woq.h b/csrc/cpu/aten/utils/woq.h
@@ -525,6 +525,51 @@ inline std::array<__m256i, 2> load_sint4_as_int8(uint8_t* qB) {
   return {low, high};
 }
 
+// Load 32 bytes of packed NF4 (two 4-bit codes per byte) from qB and decode each code through a 16-entry int8 LUT; returns {decoded low nibbles, decoded high nibbles}.
+inline std::array<__m256i, 2> load_nf4_as_int8(uint8_t* qB) {
+  __m256i packed = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(qB)); // unaligned 256-bit load: 32 bytes, two 4-bit codes each
+  const __m256i low_mask = _mm256_set1_epi8(0x0f); // 0x0f in every byte: keeps only the low nibble
+  __m256i high = _mm256_srli_epi16(packed, 4); // shifts 16-bit lanes, so bits of the neighbouring byte leak into the upper nibble; the mask on the next line removes them
+  high = _mm256_and_si256(high, low_mask); // high nibble of every input byte, as an index in 0..15
+  __m256i low = _mm256_and_si256(packed, low_mask); // low nibble of every input byte, as an index in 0..15
+  const __m256i lut = _mm256_set_epi8( // arguments are listed from byte 31 down to byte 0, so the table reads bottom-up
+      0, // bytes 31..16 (this and the following 15 zeros) are never selected: indices are masked to 0..15
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      127, // code 15; presumably the NF4 codebook scaled by 127 - the NF4 GEMM paths in this file multiply the weight scales by 1/127 to match
+      92, // code 14
+      71, // code 13
+      56, // code 12
+      43, // code 11
+      31, // code 10
+      20, // code 9
+      10, // code 8
+      0, // code 7
+      -12, // code 6
+      -23, // code 5
+      -36, // code 4
+      -50, // code 3
+      -67, // code 2
+      -88, // code 1
+      -127); // code 0
+  low = _mm256_permutexvar_epi8(low, lut); // per-byte table lookup lut[low[i]]; NOTE(review): this intrinsic needs AVX512_VBMI + AVX512VL - confirm the enclosing #if only covers builds compiled with -mavx512vbmi
+  high = _mm256_permutexvar_epi8(high, lut); // same lookup for the high-nibble codes
+  return {low, high}; // element 0: decoded low nibbles, element 1: decoded high nibbles
+}
+
 #else
 inline std::array<__m256i, 2> load_zps_4vnni(int8_t* zps) {
   TLA_ASSERT(false, "not implemented");
@@ -541,6 +586,11 @@ inline std::array<__m256i, 2> load_sint4_as_int8(uint8_t* qB) {
   return std::array<__m256i, 2>();
 }
 
+inline std::array<__m256i, 2> load_nf4_as_int8(uint8_t* qB) { // stub for the #else branch, where the intrinsic-based NF4 decoder is not compiled
+  TLA_ASSERT(false, "not implemented"); // the condition is always false, so any call trips the assertion; TLA_ASSERT is defined elsewhere - presumably it aborts or throws
+  return std::array<__m256i, 2>(); // placeholder so the function still returns a value of the declared type
+}
+
 #endif
 
 template <long N, bool sym_quant, typename T>
@@ -831,6 +881,10 @@ struct GemmMicroKernel<
     // Load scales and zps
     compile_time_for<COLS>::op([&](auto i) {
       vscales[i] = _mm512_loadu_ps(scales + i * 16);
+      if constexpr (qw_type == WOQ_DTYPE_NF4) {
+        const __m512 factor = _mm512_set1_ps(1.0f / 127.0f);
+        vscales[i] = _mm512_mul_ps(vscales[i], factor);
+      }
       // TODO(jgong5): should we use 512 or two 256 here?
       if constexpr (!sym_quant_w) {
         vzps[i] = combine_m256i(load_zps_4vnni(zps + i * 16));
@@ -859,8 +913,10 @@ struct GemmMicroKernel<
         if constexpr (!sym_quant_w) {
           vb[col] = combine_m256i(load_uint4_as_int8(pqB[k / 4][col * 16]));
           vb[col] = _mm512_sub_epi8(vb[col], vzps[col]);
-        } else {
+        } else if constexpr (qw_type == WOQ_DTYPE_INT4) {
           vb[col] = combine_m256i(load_sint4_as_int8(pqB[k / 4][col * 16]));
+        } else {
+          vb[col] = combine_m256i(load_nf4_as_int8(pqB[k / 4][col * 16]));
         }
         if constexpr (is_asymmetric_quant_a(quant_a_mode)) {
           vcompensate[col] =
@@ -1290,12 +1346,12 @@ struct Dequantize<half, ldb, N_GROUP_SIZE, qw_type, sym_quant_w> {
   }
 };
 
-template <long ldb, bool sym_quant_w>
+template <long ldb, int qw_type, bool sym_quant_w>
 struct Dequantize<
     int8_t,
     ldb,
     /*N_GROUP_SIZE*/ 16,
-    /*qw_type*/ WOQ_DTYPE_INT4,
+    qw_type,
     sym_quant_w> {
   template <int quant_a_mode>
   static inline void call(
@@ -1330,10 +1386,14 @@ struct Dequantize<
           auto [low, high] = load_uint4_as_int8(pqB[k][n]);
           vb_high = _mm256_sub_epi8(high, vzps_high);
           vb_low = _mm256_sub_epi8(low, vzps_low);
-        } else {
+        } else if constexpr (qw_type == WOQ_DTYPE_INT4) {
           auto [low, high] = load_sint4_as_int8(pqB[k][n]);
           vb_low = low;
           vb_high = high;
+        } else {
+          auto [low, high] = load_nf4_as_int8(pqB[k][n]);
+          vb_low = low;
+          vb_high = high;
         }
         if constexpr (is_asymmetric_quant_a(quant_a_mode)) {
           vcompensate[0] = _mm256_dpbusd_epi32(vcompensate[0], ones, vb_low);
@@ -1585,6 +1645,7 @@ template <
     long ldb,
     bool transA,
     bool ACC,
+    int qw_type,
     int quant_a_mode,
     int quant_w_mode,
     long PREFETCH_K_DIST>
@@ -1598,7 +1659,7 @@ class DequantGemmTPP<
     ldb,
     transA,
     ACC,
-    /*qw_type*/ WOQ_DTYPE_INT4,
+    qw_type,
     quant_a_mode,
     quant_w_mode,
     PREFETCH_K_DIST> {
@@ -1696,7 +1757,7 @@ class DequantGemmTPP<
                   ACC,
                   quant_a_mode,
                   PREFETCH_K_DIST>::
-                  template call<WOQ_DTYPE_INT4, sym_quant_w>(
+                  template call<qw_type, sym_quant_w>(
                       K,
                       qA[m],
                       lda,
@@ -1725,7 +1786,7 @@ class DequantGemmTPP<
                         ACC,
                         quant_a_mode,
                         PREFETCH_K_DIST>::
-                        template call<WOQ_DTYPE_INT4, sym_quant_w>(
+                        template call<qw_type, sym_quant_w>(
                             K,
                             qA[m],
                             lda,
@@ -1748,12 +1809,7 @@ class DequantGemmTPP<
       int8_t B[K / 4][N][4];
       int32_t qC[M][N];
       int32_t compensation[N];
-      Dequantize<
-          int8_t,
-          ldb,
-          N_GROUP_SIZE,
-          /*qw_type*/ WOQ_DTYPE_INT4,
-          sym_quant_w>::
+      Dequantize<int8_t, ldb, N_GROUP_SIZE, qw_type, sym_quant_w>::
           template call<quant_a_mode>(qB, K, N, zps, B[0][0], compensation);
       (*pgemm)((int8_t*)qA[0], B[0][0], qC[0], 1, no_tile_cfg);
       if constexpr (PREFETCH_K_DIST > 0) {
@@ -1782,11 +1838,14 @@ class DequantGemmTPP<
             }
           }
           float c = 0;
+          auto scale = scales[n];
+          if constexpr (qw_type == WOQ_DTYPE_NF4) {
+            scale *= (1.0f / 127.0f);
+          }
           if constexpr (is_asymmetric_quant_a(quant_a_mode)) {
-            c = (qC[m][n] - compensation[n] * (*zp_a_m)) * (*scale_a_m) *
-                scales[n];
+            c = (qC[m][n] - compensation[n] * (*zp_a_m)) * (*scale_a_m) * scale;
           } else {
-            c = (qC[m][n]) * (*scale_a_m) * scales[n];
+            c = (qC[m][n]) * (*scale_a_m) * scale;
           }
           if constexpr (ACC) {
             C[m * ldc + n] += c;
diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp
@@ -362,7 +362,7 @@ ContextLinearWoq create(
       lowp_mode,
       act_quant_mode,
       cache_weight_for_large_batch);
-  if (weight_dtype == WOQ_DTYPE_INT8 && lowp_mode == 3) {
+  if (weight_dtype == WOQ_DTYPE_INT8 && lowp_mode == LOWP_MODE_INT8) {
     auto compensation = woq_linear_compute_compensation(
         weight, weight_dtype, group_size, lowp_mode);
     context.cached_compensation_ =
diff --git a/docs/tutorials/features/isa_dynamic_dispatch.md b/docs/tutorials/features/isa_dynamic_dispatch.md
@@ -74,9 +74,9 @@ The CodeGen will copy each cpp files from **Kernel implementation**, and then ad
 >
 > AVX512_BF16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_BF16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -DCPU_CAPABILITY=AVX512_BF16 -DCPU_CAPABILITY_AVX512_BF16`
 >
-> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3  -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX`
+> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3  -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX`
 >
-> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3  -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16`
+> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3  -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16`
 ---
 
 >**Note:**
diff --git a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py
@@ -12,6 +12,7 @@
     quantize_per_block,
     WoqWeightDtype,
     WoqWeightQScheme,
+    WoqLowpMode,
 )
 from intel_extension_for_pytorch.nn.utils._model_convert import (
     prepack_awq_weight,
@@ -123,7 +124,7 @@ def from_float(cls, mod, scales=None, zero_points=None):
         # otherwise, it may overflow when we subtract zero points from int8 weight.
         sym_quant = qconfig.weight_qscheme == WoqWeightQScheme.SYMMETRIC
         if dtype == WoqWeightDtype.NF4 or (
-            dtype == WoqWeightDtype.INT8 and lowp_mode == 3
+            dtype == WoqWeightDtype.INT8 and lowp_mode == WoqLowpMode.INT8
         ):
             assert (
                 sym_quant is True
diff --git a/intel_extension_for_pytorch/quantization/_quantize_utils.py b/intel_extension_for_pytorch/quantization/_quantize_utils.py
diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py

Original file line number	Diff line number	Diff line change
`@@ -74,9 +74,9 @@ The CodeGen will copy each cpp files from Kernel implementation, and then ad`
`74`	`74`	`>`
`75`	`75`	> AVX512_BF16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_BF16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -DCPU_CAPABILITY=AVX512_BF16 -DCPU_CAPABILITY_AVX512_BF16`
`76`	`76`	`>`
`77`		-> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX`
	`77`	+> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX`
`78`	`78`	`>`
`79`		-> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16`
	`79`	+> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vbmi -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16`
`80`	`80`	`---`
`81`	`81`
`82`	`82`	`>Note:`