AttributeError: 'Namespace' object has no attribute 'deepspeed_config_dict'. Did you mean: 'deepspeed_config'? && batch = next(self.data_iterator)
hi20240217 opened this issue · comments
hi20240217 commented
1. AttributeError: 'Namespace' object has no attribute 'deepspeed_config_dict'. Did you mean: 'deepspeed_config'?
# Build/NCCL environment for a single-node, 4-GPU run
# (IB disabled, socket transport over ens8, errors-only NCCL logging).
export MAX_JOBS=8
export NCCL_DEBUG=error
export NCCL_SOCKET_IFNAME=ens8
export NCCL_IB_DISABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# First reproduction: 4-way pipeline parallelism (no tensor parallelism) with
# the interleaved virtual-pipeline schedule (2 layers per virtual stage).
# NOTE(review): this invocation passes no --deepspeed/--deepspeed_config
# flags; the "AttributeError: 'Namespace' object has no attribute
# 'deepspeed_config_dict'" reported above comes from this run — presumably the
# Megatron-DeepSpeed fork expects the DeepSpeed args to be present. Compare
# with the second invocation later in this report, which adds them.
torchrun --nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6000 pretrain_gpt.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 4 \
--num-layers-per-virtual-pipeline-stage 2 --overlap-p2p-communication \
--distributed-backend nccl \
--layernorm-epsilon 1e-6 \
--num-layers 8 \
--hidden-size 2048 \
--ffn-hidden-size 4096 \
--num-attention-heads 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 4 \
--global-batch-size 16 \
--train-iters 32 \
--log-interval 2 \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 0 \
--min-lr 1e-6 \
--lr-warmup-fraction 0 \
--lr-decay-iters 1 \
--lr-decay-style cosine \
--lr-warmup-iters 0 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--fp16 \
--seed 42 \
--vocab-file /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2-vocab.json \
--merge-file /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--eval-iters 0 \
--data-path /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2_text_document
2. Second error, after adding --deepspeed/--deepspeed_config (config and command below):
File "/home/microsoft-Megatron-DeepSpeed/nfs/microsoft-Megatron-DeepSpeed/pretrain_gpt.py", line 466, in <module>
pretrain(train_valid_test_datasets_provider,
File "/usr/local/lib/python3.10/dist-packages/megatron_core-0.2.0-py3.10.egg/megatron/training.py", line 227, in pretrain
iteration = train(forward_step_func,
File "/usr/local/lib/python3.10/dist-packages/megatron_core-0.2.0-py3.10.egg/megatron/training.py", line 1211, in train
train_step(forward_step_func,
File "/usr/local/lib/python3.10/dist-packages/megatron_core-0.2.0-py3.10.egg/megatron/training.py", line 670, in train_step
loss = model[0].train_batch(data_iter=data_iterator)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 378, in train_batch
self._exec_schedule(sched)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 1434, in _exec_schedule
self._exec_instr(**cmd.kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 875, in _exec_load_micro_batch
batch = self._next_batch()
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 691, in _next_batch
batch = next(self.data_iterator)
TypeError: 'list' object is not an iterator
# Write the DeepSpeed config consumed via --deepspeed_config below.
# Batch-size invariant DeepSpeed enforces:
#   train_batch_size = train_micro_batch_size_per_gpu
#                      * gradient_accumulation_steps * data_parallel_size
# Here data-parallel size is 1 (4 GPUs, all pipeline stages), so
# 16 = 4 * 4 * 1 — gradient accumulation is 4.
# Use <<'EOF' (quoted delimiter, NO leading '-'): quoting prevents any shell
# expansion inside the JSON, and dropping the '-' avoids the tab-stripping
# behavior of <<- that would silently mangle tab-indented content.
tee ds_config.json <<'EOF'
{
  "train_micro_batch_size_per_gpu": 4,
  "train_batch_size": 16,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 0
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOF
# Build/NCCL environment for a single-node, 4-GPU run
# (IB disabled, socket transport over ens8, errors-only NCCL logging).
export MAX_JOBS=8
export NCCL_DEBUG=error
export NCCL_SOCKET_IFNAME=ens8
export NCCL_IB_DISABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# DeepSpeed-enabled launch: 4-way pipeline parallelism, ZeRO stage 0,
# config written to ./ds_config.json above.
# FIX: removed "--num-layers-per-virtual-pipeline-stage 2
# --overlap-p2p-communication". The traceback above shows DeepSpeed's
# PipelineEngine pulling microbatches via next(self.data_iterator)
# (deepspeed/runtime/pipe/engine.py, _next_batch), but Megatron's interleaved
# virtual-pipeline schedule supplies a *list* of per-model-chunk iterators,
# which yields "TypeError: 'list' object is not an iterator". DeepSpeed's
# pipeline engine uses its own schedule, so the interleaved flags must be
# dropped when --deepspeed is used.
torchrun --nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6000 pretrain_gpt.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 4 \
--distributed-backend nccl \
--layernorm-epsilon 1e-6 \
--num-layers 8 \
--hidden-size 2048 \
--ffn-hidden-size 4096 \
--num-attention-heads 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 4 \
--global-batch-size 16 \
--train-iters 32 \
--log-interval 2 \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 0 \
--min-lr 1e-6 \
--lr-warmup-fraction 0 \
--lr-decay-iters 1 \
--lr-decay-style cosine \
--lr-warmup-iters 0 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--fp16 \
--seed 42 \
--vocab-file /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2-vocab.json \
--merge-file /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--eval-iters 0 \
--data-path /home/microsoft-Megatron-DeepSpeed/gpt2-data/gpt2_text_document \
--deepspeed --deepspeed_config ./ds_config.json \
--zero-stage 0
JiangGY commented
I have this question too