huggingface / datasets

🤗 The largest hub of ready-to-use datasets for ML models with fast, easy-to-use and efficient data manipulation tools

Home Page: https://huggingface.co/docs/datasets

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

JSON loader implicitly coerces floats to integers

albertvillanova opened this issue · comments

The JSON loader implicitly coerces floats to integers.

For example, the column values [0.0, 1.0, 2.0] are coerced to [0, 1, 2], so the column's type is inferred as `int` instead of `float`.
See the resulting CI failure in dataset-viewer: https://github.com/huggingface/dataset-viewer/actions/runs/9290164936/job/25576926446

 =================================== FAILURES ===================================
___________________________ test_statistics_endpoint ___________________________

normal_user_public_json_dataset = 'DVUser/tmp-dataset-17170199043860'

    def test_statistics_endpoint(normal_user_public_json_dataset: str) -> None:
        dataset = normal_user_public_json_dataset
        config, split = get_default_config_split()
        statistics_response = poll_until_ready_and_assert(
            relative_url=f"/statistics?dataset={dataset}&config={config}&split={split}",
            check_x_revision=True,
            dataset=dataset,
        )
    
        content = statistics_response.json()
        assert len(content) == 3
        assert sorted(content) == ["num_examples", "partial", "statistics"], statistics_response
        statistics = content["statistics"]
        num_examples = content["num_examples"]
        partial = content["partial"]
    
        assert isinstance(statistics, list), statistics
        assert len(statistics) == 6
        assert num_examples == 4
        assert partial is False
    
        string_label_column = statistics[0]
        assert "column_name" in string_label_column
        assert "column_statistics" in string_label_column
        assert "column_type" in string_label_column
        assert string_label_column["column_name"] == "col_1"
        assert string_label_column["column_type"] == "string_label"  # 4 unique values -> label
        assert isinstance(string_label_column["column_statistics"], dict)
        assert string_label_column["column_statistics"] == {
            "nan_count": 0,
            "nan_proportion": 0.0,
            "no_label_count": 0,
            "no_label_proportion": 0.0,
            "n_unique": 4,
            "frequencies": {
                "There goes another one.": 1,
                "Vader turns round and round in circles as his ship spins into space.": 1,
                "We count thirty Rebel ships, Lord Vader.": 1,
                "The wingman spots the pirateship coming at him and warns the Dark Lord": 1,
            },
        }
    
        int_column = statistics[1]
        assert "column_name" in int_column
        assert "column_statistics" in int_column
        assert "column_type" in int_column
        assert int_column["column_name"] == "col_2"
        assert int_column["column_type"] == "int"
        assert isinstance(int_column["column_statistics"], dict)
        assert int_column["column_statistics"] == {
            "histogram": {"bin_edges": [0, 1, 2, 3, 3], "hist": [1, 1, 1, 1]},
            "max": 3,
            "mean": 1.5,
            "median": 1.5,
            "min": 0,
            "nan_count": 0,
            "nan_proportion": 0.0,
            "std": 1.29099,
        }
    
        float_column = statistics[2]
        assert "column_name" in float_column
        assert "column_statistics" in float_column
        assert "column_type" in float_column
        assert float_column["column_name"] == "col_3"
>       assert float_column["column_type"] == "float"
E       AssertionError: assert 'int' == 'float'
E         - float
E         + int

tests/test_14_statistics.py:72: AssertionError

=========================== short test summary info ============================
FAILED tests/test_14_statistics.py::test_statistics_endpoint - AssertionError: assert 'int' == 'float'
  - float
  + int

This bug was introduced after:

We have reported the issue to pandas: