
Commit ad29b23

[LLGA] Replace aten::type_as with aten::to to support a corner-case (#3030)
* Replace aten::type_as with aten::to to support a corner case: an intermediate output of one of the partitions (the output of an add op) is used as an input to type_as later in the graph. It is easier to work around this issue by replacing all aten::type_as nodes with aten::to nodes. Another potential fix would be to add an LLGA End node after such intermediate outputs that are reused later in the graph, but that solution would require more extensive changes, since no information about the future use of intermediate outputs is available to IPEX/PyTorch while creating partitions at the subgraph level.

* Update graph_helper.cpp

* Update test_jit_llga_fuser.py
1 parent 98caa70 commit ad29b23
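
The rewrite relies on a simple equivalence: for tensors on the same device, x.type_as(y) and x.to(y.dtype) produce the same result. A minimal eager-mode sketch (illustrative only, not part of the commit):

import torch

x = torch.randn(4, 4)                          # float32
y = torch.randn(4, 4, dtype=torch.bfloat16)

out_type_as = x.type_as(y)                     # cast x to y's dtype via aten::type_as
out_to = x.to(y.dtype)                         # the same cast expressed as aten::to

assert out_type_as.dtype == out_to.dtype == torch.bfloat16
assert torch.equal(out_type_as, out_to)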

File tree

3 files changed: +42 -3 lines changed

csrc/cpu/jit/codegen/onednn/graph_helper.cpp

Lines changed: 14 additions & 0 deletions
@@ -739,6 +739,20 @@ bool LlgaGraphHelper::isSingleQuantDequantTo(Node* n) {
       n->kind() != Symbol::aten("quantize_per_channel") &&
       n->kind() != Symbol::aten("dequantize") && n->kind() != aten::to)
     return false;
+  // Check if aten::to is used for non-quantized case
+  if (n->kind() == aten::to) {
+    auto input_dtype = n->input(0)->type()->expect<TensorType>()->scalarType();
+    auto output_dtype =
+        n->outputs()[0]->type()->expect<TensorType>()->scalarType();
+    if (input_dtype.has_value() && output_dtype.has_value()) {
+      if ((input_dtype.value() == at::ScalarType::Float ||
+           input_dtype.value() == at::ScalarType::BFloat16) &&
+          (output_dtype.value() == at::ScalarType::Float ||
+           output_dtype.value() == at::ScalarType::BFloat16)) {
+        return false;
+      }
+    }
+  }
   if (!opToOwningPartition_.has(n))
     return false;
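
The added check means a lone fp32 <-> bf16 aten::to node is no longer treated as a quantize/dequantize-style cast and filtered out, which is why the updated test below expects such a TypeCast to map to a oneDNN Graph partition. A hypothetical module that produces exactly this kind of node (an illustration, not code from the commit):

import torch

class SingleCast(torch.nn.Module):
    def forward(self, x):
        # a single dtype cast, recorded as an aten::to node in the JIT graph
        return x.to(torch.bfloat16)

m = torch.jit.trace(SingleCast().eval(), torch.randn(2, 3))
print(m.graph)  # contains one aten::to node casting float32 -> bfloat16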

csrc/cpu/jit/codegen/onednn/prepare_binary.cpp

Lines changed: 24 additions & 0 deletions
@@ -90,6 +90,28 @@ void handleBinaryOpInputs(Node* node, int first_input, int second_input) {
   }
 }
 
+static void ReplaceTypeAsWithTo(Block* block) {
+  for (auto node : block->nodes()) {
+    for (auto sub : node->blocks()) {
+      ReplaceTypeAsWithTo(sub);
+    }
+
+    if (node->kind() == aten::type_as) {
+      auto nodeOutputTypePtr = node->output()->type()->expect<TensorType>();
+      c10::optional<at::ScalarType> outputDtype =
+          nodeOutputTypePtr->scalarType();
+      if (outputDtype.has_value()) {
+        auto g = node->prev()->owningGraph();
+        auto replacementNodeOutput =
+            g->insert(aten::to, {node->input(0), outputDtype.value()});
+        replacementNodeOutput->setType(
+            nodeOutputTypePtr->withScalarType(outputDtype.value()));
+        node->outputs()[0]->replaceAllUsesWith(replacementNodeOutput);
+      }
+    }
+  }
+}
+
 static void ConvertScalarToTensor(Block* block) {
   for (auto node : block->nodes()) {
     for (auto sub : node->blocks()) {

@@ -246,6 +268,8 @@ void PrepareBinaryForLLGA(const std::shared_ptr<Graph>& graph) {
   EliminateDeadCode(graph);
   // ConvertScalarToTensor must be placed after EliminateIdentityMulAddDiv
   replaceWithSelectOp(graph->block());
+  ReplaceTypeAsWithTo(graph->block());
+  EliminateDeadCode(graph);
   ConvertScalarToTensor(graph->block());
   // TODO: after conv-bn folding, bias will become bias? (Optional) after this
   // pass and will lose it when using mustNotBeNone to check Optional Bias
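
ReplaceTypeAsWithTo and the extra EliminateDeadCode run before ConvertScalarToTensor, so by the time partitions are formed no aten::type_as nodes remain. A hypothetical sketch of the corner case described in the commit message (shapes and ops are illustrative, not taken from the commit), where an intermediate add output is reused later in the graph as the type reference for type_as:

import torch

class CornerCase(torch.nn.Module):
    def forward(self, x, y):
        t = x + y              # intermediate output of one partition
        z = torch.relu(x)
        return z.type_as(t)    # t is reused later in the graph as the type reference

m = torch.jit.script(CornerCase())
print(m.graph)  # the scripted graph contains an aten::type_as node before the pass runs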

tests/cpu/test_jit_llga_fuser.py

Lines changed: 4 additions & 3 deletions
@@ -222,7 +222,8 @@ def forward(self, x):
         m = M()
         x = torch.rand(8, 12, 12, 12)
         graph, _ = self.checkTrace(m, [x])
-        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+        # One partition for softmax & another for TypeCast
+        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
 
     def _gen_binary_inputs(self, gen_permute=True):
         for xshape, yshape in [

@@ -507,8 +508,8 @@ def forward(self, x):
         m = M(dst_dtype)
 
         graph, _ = self.checkTrace(m, [x])
-        # we do not rewrite single to
-        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0)
+        # Even a single TypeCast is mapped to oneDNN Graph
+        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
 
     @llga_fp32_bf16_test_env
     def test_typecheck(self):
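
For reference, a check like assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, n) amounts to counting the nodes of a given kind in the JIT graph; a rough sketch of that idea (an assumption about the helper's intent, not its actual implementation):

def count_nodes(graph, kind):
    # count nodes in a torch._C.Graph whose kind matches the given string
    return sum(1 for node in graph.nodes() if node.kind() == kind)

# e.g. count_nodes(traced_module.graph, "aten::to")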
