iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.

Home Page:

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Bufferization issue with redundant allocation and copy

yzhang93 opened this issue · comments

What happened?

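There seems to be a bug in bufferization that creates a redundant allocation and copy: in the bufferized IR below, a new buffer (`%alloc_14`) is allocated, the contents of `%subview_13` are copied into it by an identity `linalg.generic`, and the compute `linalg.generic` then writes to `%alloc_14` instead of writing to `%subview_13` directly.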

IR snippet before bufferization:

  %11 = scf.forall (%arg3, %arg4) = (0, 0) to (16, 16) step (8, 8) shared_outs(%arg5 = %8) -> (tensor<1x1x16x16x4x4xi32>) {
      %16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %extracted_slice_14 = tensor.extract_slice %pack[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_16 = tensor.extract_slice %pack_9[0, 0, 0, %18] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x64xi32> to tensor<1x1x32x32xi32>
      %19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_18 = tensor.extract_slice %arg5[0, 0, %arg4, %arg3, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      %20 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
      %21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%20 : tensor<1x1x8x8x4x4xi32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_19: i32, %out: i32):
        %22 = arith.muli %in, %in_19 : i32
        %23 = arith.addi %out, %22 : i32
        linalg.yield %23 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %21 into %arg5[0, 0, %arg4, %arg3, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x16x16x4x4xi32>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

After bufferization:

  scf.forall (%arg2, %arg3) = (0, 0) to (16, 16) step (8, 8) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %subview_11 = memref.subview %alloc_2[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_12 = memref.subview %alloc_1[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
      %subview_13 = memref.subview %alloc_3[0, 0, %arg3, %arg2, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
      %alloc_14 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%alloc_14 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
      ^bb0(%in: i32, %out: i32):
        linalg.yield %in : i32
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_14 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_16: i32, %out: i32):
        %5 = arith.muli %in, %in_16 : i32
        %6 = arith.addi %out, %5 : i32
        linalg.yield %6 : i32
      %subview_15 = memref.subview %alloc_4[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.unpack %alloc_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

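The expected output after bufferization, with no extra allocation or copy and with the compute `linalg.generic` writing directly into `%subview_13`, is: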

scf.forall (%arg2, %arg3) = (0, 0) to (16, 16) step (8, 8) {
  %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
  %subview_11 = memref.subview %alloc_2[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
  %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
  %subview_12 = memref.subview %alloc_1[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
  %subview_13 = memref.subview %alloc_3[0, 0, %arg3, %arg2, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
  ^bb0(%in: i32, %in_15: i32, %out: i32):
    %5 = arith.muli %in, %in_15 : i32
    %6 = arith.addi %out, %5 : i32
    linalg.yield %6 : i32
  %subview_14 = memref.subview %alloc_4[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

The full IR dump can be found via the link in the original issue (the link target is missing from this copy).

Steps to reproduce your issue

No response

What component(s) does this issue relate to?

No response

Version information

No response

Additional context

No response