[flow] AnnotateDispatches does not spell out linalg ops in LinalgExt fusion

Since we are working on LinalgExt fusion, we want to make the entry function names more descriptive. Otherwise, when looking at traces, we can't tell whether a dispatch contains a single LinalgExt op or is a fusion case. E.g., the dispatch below is summarized as .*winograd_input_transform_1x22x22x512x8x8xf32, but the fused generic op is missing from the name. Perhaps we should name it winograd_input_transform_1x22x22x512x8x8xf32_generic/transpose/broadcast, i.e., append a suffix describing the fused linalg op.

The relevant logic can be found in AnnotateDispatches.cpp.

hal.executable public @main$async_dispatch_14 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d"}>) {
    hal.executable.export public @main$async_dispatch_14_winograd_input_transform_1x22x22x512x8x8xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    builtin.module {
      func.func @main$async_dispatch_14_winograd_input_transform_1x22x22x512x8x8xf32() {
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = arith.index_castui %0 : i32 to index
        %3 = arith.index_castui %1 : i32 to index
        %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x512x130x130xf32>>
        %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<8x8x1x22x22x512xbf16>>
        %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [1, 512, 130, 130], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x512x130x130xf32>> -> tensor<1x512x130x130xf32>
        %7 = tensor.empty() : tensor<8x8x1x22x22x512xbf16>
        %8 = tensor.empty() : tensor<1x22x22x512x8x8xf32>
        %9 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([2, 3]) input_tile_dimensions([4, 5]) ins(%6 : tensor<1x512x130x130xf32>) outs(%8 : tensor<1x22x22x512x8x8xf32>) -> tensor<1x22x22x512x8x8xf32>
        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d4, d5, d0, d1)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%9 : tensor<1x22x22x512x8x8xf32>) outs(%7 : tensor<8x8x1x22x22x512xbf16>) {
        ^bb0(%in: f32, %out: bf16):
          %11 = arith.truncf %in : f32 to bf16
          linalg.yield %11 : bf16
        } -> tensor<8x8x1x22x22x512xbf16> %10, %5, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 1, 22, 22, 512], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x1x22x22x512xbf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x1x22x22x512xbf16>>