cannot split dataset when using load_dataset
cybest0608 opened this issue · comments
Describe the bug
When I use the load_dataset method to load mozilla-foundation/common_voice_7_0, it successfully downloads and extracts the dataset, but it cannot generate the Arrow file.
This bug happens on my server and on my laptop, the same as in #6906, but it does not happen in Google Colab. I have been working on it for days; even when I load the dataset from a local path, it can generate the train split and the validation split, but the bug happens again on the test split.
Steps to reproduce the bug
from datasets import load_dataset, load_metric, Audio
common_voice_train = load_dataset("mozilla-foundation/common_voice_7_0", "ja", split="train", token=selftoken, trust_remote_code=True)
Expected behavior
{
"name": "ValueError",
"message": "Instruction \"train\" corresponds to no data!",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 3
1 from datasets import load_dataset, load_metric, Audio
----> 3 common_voice_train = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"ja\", split=\"train\",token='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')#,trust_remote_code=True)#,streaming=True)
4 common_voice_test = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"ja\", split=\"test\",token='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')#,trust_remote_code=True)#,streaming=True)
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\load.py:2626, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2622 # Build dataset for splits
2623 keep_in_memory = (
2624 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2625 )
-> 2626 ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
2627 # Rename and cast features to match task schema
2628 if task is not None:
2629 # To avoid issuing the same warning twice
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\builder.py:1266, in DatasetBuilder.as_dataset(self, split, run_post_process, verification_mode, ignore_verifications, in_memory)
1263 verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
1265 # Create a dataset for each of the given splits
-> 1266 datasets = map_nested(
1267 partial(
1268 self._build_single_dataset,
1269 run_post_process=run_post_process,
1270 verification_mode=verification_mode,
1271 in_memory=in_memory,
1272 ),
1273 split,
1274 map_tuple=True,
1275 disable_tqdm=True,
1276 )
1277 if isinstance(datasets, dict):
1278 datasets = DatasetDict(datasets)
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\utils\\py_utils.py:484, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)
482 if batched:
483 data_struct = [data_struct]
--> 484 mapped = function(data_struct)
485 if batched:
486 mapped = mapped[0]
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\builder.py:1296, in DatasetBuilder._build_single_dataset(self, split, run_post_process, verification_mode, in_memory)
1293 split = Split(split)
1295 # Build base dataset
-> 1296 ds = self._as_dataset(
1297 split=split,
1298 in_memory=in_memory,
1299 )
1300 if run_post_process:
1301 for resource_file_name in self._post_processing_resources(split).values():
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\builder.py:1370, in DatasetBuilder._as_dataset(self, split, in_memory)
1368 if self._check_legacy_cache():
1369 dataset_name = self.name
-> 1370 dataset_kwargs = ArrowReader(cache_dir, self.info).read(
1371 name=dataset_name,
1372 instructions=split,
1373 split_infos=self.info.splits.values(),
1374 in_memory=in_memory,
1375 )
1376 fingerprint = self._get_dataset_fingerprint(split)
1377 return Dataset(fingerprint=fingerprint, **dataset_kwargs)
File c:\\Users\\cybes\\.conda\\envs\\ECoG\\lib\\site-packages\\datasets\\arrow_reader.py:256, in BaseReader.read(self, name, instructions, split_infos, in_memory)
254 msg = f'Instruction \"{instructions}\" corresponds to no data!'
255 #msg = f'Instruction \"{self._path}\",\"{name}\",\"{instructions}\",\"{split_infos}\" corresponds to no data!'
--> 256 raise ValueError(msg)
257 return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
ValueError: Instruction \"train\" corresponds to no data!"
}
Environment info
Environment:
python 3.9
windows 11 pro
VScode+jupyter
It seems the bug happens on all Windows systems: I tried it on Windows 8.1, 10, and 11, and all of them failed. But it does not happen on Linux (Ubuntu and CentOS 7) or Mac (both my virtual and physical machines). I still don't know what the problem is. Maybe it is related to the path? I cannot use the split file that was created on Linux on my Windows server (even when I replace the path in the Arrow file)... I have worked on this for a week but still cannot fix it... upset.