JSON loader implicitly coerces floats to integers
albertvillanova opened this issue · comments
The JSON loader implicitly coerces floats to integers.
The column values `[0.0, 1.0, 2.0]` are coerced to `[0, 1, 2]`.
See CI error in dataset-viewer: https://github.com/huggingface/dataset-viewer/actions/runs/9290164936/job/25576926446
=================================== FAILURES ===================================
___________________________ test_statistics_endpoint ___________________________
normal_user_public_json_dataset = 'DVUser/tmp-dataset-17170199043860'
def test_statistics_endpoint(normal_user_public_json_dataset: str) -> None:
dataset = normal_user_public_json_dataset
config, split = get_default_config_split()
statistics_response = poll_until_ready_and_assert(
relative_url=f"/statistics?dataset={dataset}&config={config}&split={split}",
check_x_revision=True,
dataset=dataset,
)
content = statistics_response.json()
assert len(content) == 3
assert sorted(content) == ["num_examples", "partial", "statistics"], statistics_response
statistics = content["statistics"]
num_examples = content["num_examples"]
partial = content["partial"]
assert isinstance(statistics, list), statistics
assert len(statistics) == 6
assert num_examples == 4
assert partial is False
string_label_column = statistics[0]
assert "column_name" in string_label_column
assert "column_statistics" in string_label_column
assert "column_type" in string_label_column
assert string_label_column["column_name"] == "col_1"
assert string_label_column["column_type"] == "string_label" # 4 unique values -> label
assert isinstance(string_label_column["column_statistics"], dict)
assert string_label_column["column_statistics"] == {
"nan_count": 0,
"nan_proportion": 0.0,
"no_label_count": 0,
"no_label_proportion": 0.0,
"n_unique": 4,
"frequencies": {
"There goes another one.": 1,
"Vader turns round and round in circles as his ship spins into space.": 1,
"We count thirty Rebel ships, Lord Vader.": 1,
"The wingman spots the pirateship coming at him and warns the Dark Lord": 1,
},
}
int_column = statistics[1]
assert "column_name" in int_column
assert "column_statistics" in int_column
assert "column_type" in int_column
assert int_column["column_name"] == "col_2"
assert int_column["column_type"] == "int"
assert isinstance(int_column["column_statistics"], dict)
assert int_column["column_statistics"] == {
"histogram": {"bin_edges": [0, 1, 2, 3, 3], "hist": [1, 1, 1, 1]},
"max": 3,
"mean": 1.5,
"median": 1.5,
"min": 0,
"nan_count": 0,
"nan_proportion": 0.0,
"std": 1.29099,
}
float_column = statistics[2]
assert "column_name" in float_column
assert "column_statistics" in float_column
assert "column_type" in float_column
assert float_column["column_name"] == "col_3"
> assert float_column["column_type"] == "float"
E AssertionError: assert 'int' == 'float'
E - float
E + int
tests/test_14_statistics.py:72: AssertionError
=========================== short test summary info ============================
FAILED tests/test_14_statistics.py::test_statistics_endpoint - AssertionError: assert 'int' == 'float'
- float
+ int
This bug was introduced after:
We have reported the issue to pandas: