apache / iceberg

Apache Iceberg

Home Page: https://iceberg.apache.org/

Repository from GitHub: https://github.com/apache/iceberg

Snapshot action creates a V2 table with V1 metadata

Fokko opened this issue · comments

Apache Iceberg version

1.10.0 (latest release)

Query engine

Spark

Please describe the bug 🐞

Noticed this while migrating to the Rust-based Avro metadata parser.

Reproducible by the following integration test: https://github.com/apache/iceberg-python/blob/1339412e6480151c1fe2d275986867583e0facff/tests/integration/test_hive_migration.py#L27

The resulting table has the following metadata:

{
    "format-version": 2,
    "table-uuid": "0f07a6df-6c48-45f3-99da-e8049828f365",
    "location": "s3a://warehouse/hive/test_migrate_table_1758576517",
    "last-sequence-number": 1,
    "last-updated-ms": 1758576518885,
    "last-column-id": 2,
    "current-schema-id": 0,
    "schemas": [
        {
            "type": "struct",
            "schema-id": 0,
            "fields": [
                {
                    "id": 1,
                    "name": "number",
                    "required": false,
                    "type": "int"
                },
                {
                    "id": 2,
                    "name": "dt",
                    "required": false,
                    "type": "date"
                }
            ]
        }
    ],
    "default-spec-id": 0,
    "partition-specs": [
        {
            "spec-id": 0,
            "fields": [
                {
                    "name": "dt",
                    "transform": "identity",
                    "source-id": 2,
                    "field-id": 1000
                }
            ]
        }
    ],
    "last-partition-id": 1000,
    "default-sort-order-id": 0,
    "sort-orders": [
        {
            "order-id": 0,
            "fields": []
        }
    ],
    "properties": {
        "schema.name-mapping.default": "[ {\n  \"field-id\" : 1,\n  \"names\" : [ \"number\" ]\n}, {\n  \"field-id\" : 2,\n  \"names\" : [ \"dt\" ]\n} ]",
        "gc.enabled": "false",
        "snapshot": "true",
        "write.parquet.compression-codec": "zstd"
    },
    "current-snapshot-id": 4929450008479086000,
    "refs": {
        "main": {
            "snapshot-id": 4929450008479086000,
            "type": "branch"
        }
    },
    "snapshots": [
        {
            "sequence-number": 1,
            "snapshot-id": 4929450008479086000,
            "timestamp-ms": 1758576518885,
            "summary": {
                "operation": "append",
                "added-data-files": "6",
                "added-records": "6",
                "total-records": "6",
                "total-files-size": "0",
                "total-data-files": "6",
                "total-delete-files": "0",
                "total-position-deletes": "0",
                "total-equality-deletes": "0",
                "engine-version": "3.5.6",
                "app-id": "local-1758576417383",
                "engine-name": "spark",
                "iceberg-version": "Apache Iceberg 1.10.0 (commit 2114bf631e49af532d66e2ce148ee49dd1dd1f1f)"
            },
            "manifest-list": "s3a://warehouse/hive/test_migrate_table_1758576517/metadata/snap-4929450008479085298-1-d95a1f0e-a6e4-4e1f-a343-73465a12ccc8.avro",
            "schema-id": 0
        }
    ],
    "statistics": [],
    "partition-statistics": [],
    "snapshot-log": [
        {
            "timestamp-ms": 1758576518885,
            "snapshot-id": 4929450008479086000
        }
    ],
    "metadata-log": []
}

Manifest-list:

{
    "manifest_path": "s3a://warehouse/hive/test_migrate_table_1758576517/metadata/stage-202-task-991-manifest-5a9ff02d-06f8-4818-b95a-dc1686367f90.avro",
    "manifest_length": 6060,
    "partition_spec_id": 0,
    "content": 0,
    "sequence_number": 1,
    "min_sequence_number": 1,
    "added_snapshot_id": 4929450008479086000,
    "added_files_count": 6,
    "existing_files_count": 0,
    "deleted_files_count": 0,
    "added_rows_count": 6,
    "existing_rows_count": 0,
    "deleted_rows_count": 0,
    "partitions": {
        "array": [
            {
                "contains_null": false,
                "contains_nan": {
                    "boolean": false
                },
                "lower_bound": {
                    "bytes": "1J\u0000\u0000"
                },
                "upper_bound": {
                    "bytes": "�K\u0000\u0000"
                }
            }
        ]
    },
    "key_metadata": null
}

Manifest:

{
    "status": 1,
    "snapshot_id": null,
    "data_file": {
        "file_path": "s3a://warehouse/hive/test_migrate_table_hive_1758576517/dt=2022-01-01/part-00000-f0601235-03dd-4656-8bec-34a162fa1edb.c000.snappy.parquet",
        "file_format": "PARQUET",
        "partition": {
            "dt": {
                "int": 18993
            }
        },
        "record_count": 1,
        "file_size_in_bytes": 458,
        "block_size_in_bytes": 67108864,
        "column_sizes": {
            "array": []
        },
        "value_counts": {
            "array": []
        },
        "null_value_counts": {
            "array": []
        },
        "nan_value_counts": {
            "array": []
        },
        "lower_bounds": {
            "array": []
        },
        "upper_bounds": {
            "array": []
        },
        "key_metadata": null,
        "split_offsets": null,
        "sort_order_id": {
            "int": 0
        }
    }
}

The problem here is that the manifest list is V2 (there is a sequence number), but the manifest itself is V1 (snapshot inheritance is enabled):

avro-tools getmeta stage-202-task-991-manifest-5a9ff02d-06f8-4818-b95a-dc1686367f90.avro
...
format-version	1

Willingness to contribute

  • I can contribute a fix for this bug independently
  • I would be willing to contribute a fix for this bug with guidance from the Iceberg community
  • I cannot contribute a fix for this bug at this time