Bufferization issue with redundant allocation and copy

Question

Bufferization issue with redundant allocation and copy

yzhang93 opened this issue a month ago · comments

What happened?

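There seems to be a bug in bufferization that creates a redundant allocation and copy. In the bufferized IR below, a new buffer `%alloc_14` is allocated, `%subview_13` is copied into it by an extra `linalg.generic`, and the compute `linalg.generic` then writes to `%alloc_14` instead of writing directly into `%subview_13`.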

IR snippet before bufferization:

  %11 = scf.forall (%arg3, %arg4) = (0, 0) to (16, 16) step (8, 8) shared_outs(%arg5 = %8) -> (tensor<1x1x16x16x4x4xi32>) {
      %16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %extracted_slice_14 = tensor.extract_slice %pack[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_16 = tensor.extract_slice %pack_9[0, 0, 0, %18] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x64xi32> to tensor<1x1x32x32xi32>
      %19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_18 = tensor.extract_slice %arg5[0, 0, %arg4, %arg3, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      %20 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
      %21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%20 : tensor<1x1x8x8x4x4xi32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_19: i32, %out: i32):
        %22 = arith.muli %in, %in_19 : i32
        %23 = arith.addi %out, %22 : i32
        linalg.yield %23 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %21 into %arg5[0, 0, %arg4, %arg3, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x16x16x4x4xi32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

After bufferization:

  scf.forall (%arg2, %arg3) = (0, 0) to (16, 16) step (8, 8) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %subview_11 = memref.subview %alloc_2[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_12 = memref.subview %alloc_1[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
      %subview_13 = memref.subview %alloc_3[0, 0, %arg3, %arg2, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
      %alloc_14 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%alloc_14 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
      ^bb0(%in: i32, %out: i32):
        linalg.yield %in : i32
      }
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_14 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_16: i32, %out: i32):
        %5 = arith.muli %in, %in_16 : i32
        %6 = arith.addi %out, %5 : i32
        linalg.yield %6 : i32
      }
      %subview_15 = memref.subview %alloc_4[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.unpack %alloc_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

The ideal output after bufferization should be:

scf.forall (%arg2, %arg3) = (0, 0) to (16, 16) step (8, 8) {
  %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
  %subview_11 = memref.subview %alloc_2[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[2048, 2048, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
  %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
  %subview_12 = memref.subview %alloc_1[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 2048, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
  %subview_13 = memref.subview %alloc_3[0, 0, %arg3, %arg2, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [0, 0, 0, 8, 8, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
  ^bb0(%in: i32, %in_15: i32, %out: i32):
    %5 = arith.muli %in, %in_15 : i32
    %6 = arith.addi %out, %5 : i32
    linalg.yield %6 : i32
  }
  %subview_14 = memref.subview %alloc_4[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
  iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[4096, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

The full IR dump can be found at https://gist.github.com/yzhang93/55f448368db32cccd2af31c730cc878a#file-gistfile1-txt-L342

Steps to reproduce your issue

No response

What component(s) does this issue relate to?

No response

Version information

No response

Additional context

No response