Hi, I am running the fine-tuning example for the Llama-2-70b language model with DeepSpeed.
Running script:
PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \
python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \
--model_name_or_path meta-llama/Llama-2-70b-hf \
--deepspeed llama2_ds_zero3_config.json \
--dataset_name tatsu-lab/alpaca \
--bf16 True \
--output_dir ./lora_out \
--num_train_epochs 2 \
--max_seq_len 2048 \
--per_device_train_batch_size 10 \
--per_device_eval_batch_size 10 \
--gradient_checkpointing \
--evaluation_strategy epoch \
--eval_delay 2 \
--save_strategy no \
--learning_rate 0.0018 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--dataset_concatenation \
--attn_softmax_bf16 True \
--do_train \
--do_eval \
--use_habana \
--use_lazy_mode \
--pipelining_fwd_bwd \
--throughput_warmup_steps 3 \
--lora_rank 4 \
--lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \
--validation_split_percentage 4 \
--use_flash_attention True
The command is taken from the documentation.
But I got the following failure message:
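The run uses the ZeRO stage-3 config file llama2_ds_zero3_config.json. For context, a minimal illustrative stage-3 config of this kind, written here as a Python dict with placeholder values (not necessarily the exact contents of that file), looks like:

# Illustrative sketch only: these are standard DeepSpeed ZeRO-3 options,
# but the concrete values are placeholders, not the actual contents of
# llama2_ds_zero3_config.json.
import json

ds_zero3_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": False,
        "contiguous_gradients": True,
        "reduce_scatter": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 1,
}

print(json.dumps(ds_zero3_config, indent=2))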
[2024-04-02 07:50:04,147] [INFO] [checkpointing.py:540:forward] ----Partition Activations False, CPU CHECKPOINTING False
[2024-04-02 07:50:04,147] [INFO] [checkpointing.py:541:forward] ----contiguous Memory Checkpointing False with None total layers
[2024-04-02 07:50:04,147] [INFO] [checkpointing.py:543:forward] ----Synchronization False
[2024-04-02 07:50:04,147] [INFO] [checkpointing.py:544:forward] ----Profiling time in checkpointing False
Traceback (most recent call last):
File "/root/workspace/optimum-habana/examples/language-modeling/run_lora_clm.py", line 744, in <module>
main()
File "/root/workspace/optimum-habana/examples/language-modeling/run_lora_clm.py", line 717, in main
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 509, in train
return inner_training_loop(
File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 915, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 1488, in training_step
self.accelerator.backward(loss)
File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1960, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 2017, in backward
self.allreduce_gradients()
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1934, in allreduce_gradients
self.optimizer.overlapping_partition_gradients_reduce_epilogue()
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 1095, in overlapping_partition_gradients_reduce_epilogue
self.independent_gradient_partition_epilogue()
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 1072, in independent_gradient_partition_epilogue
self.__reduce_and_partition_ipg_grads()
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 1201, in __reduce_and_partition_ipg_grads
grad_partitions = self.__avg_scatter_grads(self.params_in_ipg_bucket)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 1269, in __avg_scatter_grads
grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, self.dp_process_group)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/comm/coalesced_collectives.py", line 120, in reduce_scatter_coalesced
_torch_reduce_scatter_fn(tensor_partition_flat_buffer,
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/comm/coalesced_collectives.py", line 23, in _torch_reduce_scatter_fn
return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, input_tensor, group=group, async_op=False)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 257, in reduce_scatter_fn
return reduce_scatter_tensor(output_tensor,
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 289, in reduce_scatter_tensor
return cdb.reduce_scatter_tensor(output_tensor=output_tensor,
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 265, in reduce_scatter_tensor
return self.reduce_scatter_function(output_tensor,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 3120, in reduce_scatter_tensor
work = group._reduce_scatter_base(output, input, opts)
RuntimeError: No backend type associated with device type cpu
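The RuntimeError at the bottom comes from torch.distributed: the ZeRO-3 gradient reduce-scatter reaches torch.distributed.reduce_scatter_tensor, and the call appears to be dispatched on CPU tensors, while the process group on Gaudi only has the hccl backend, which is registered for the hpu device, so no backend exists for device type cpu. A minimal standalone sketch of the same class of error, separate from my training run (it assumes a single Gaudi card, a single process, and that importing habana_frameworks.torch.distributed.hccl registers the hccl backend; the tensor sizes are arbitrary):

import os
import torch
import torch.distributed as dist
import habana_frameworks.torch.distributed.hccl  # noqa: F401  -- registers the "hccl" backend

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="hccl", rank=0, world_size=1)

world = dist.get_world_size()
out = torch.zeros(4)         # CPU tensor
inp = torch.ones(4 * world)  # CPU tensor

# hccl is only registered for the hpu device, so dispatching this collective
# with CPU tensors raises:
#   RuntimeError: No backend type associated with device type cpu
dist.reduce_scatter_tensor(out, inp)

# The same call goes through once both tensors live on the HPU:
# dist.reduce_scatter_tensor(out.to("hpu"), inp.to("hpu"))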
How can I solve this problem?