I am using this script for training, but it gets stuck. What could be the reason? Thank you.
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779]
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] *****************************************
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] *****************************************
Traceback (most recent call last):
  File "/opt/conda/envs/bamba/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
    result = agent.run()
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
    result = f(*args, **kwargs)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
    result = self._invoke_run(role)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 829, in _invoke_run
    self._initialize_workers(self._worker_group)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
    result = f(*args, **kwargs)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 652, in _initialize_workers
    self._rendezvous(worker_group)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
    result = f(*args, **kwargs)
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 489, in _rendezvous
    rdzv_info = spec.rdzv_handler.next_rendezvous()
  File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 66, in next_rendezvous
    self._store = TCPStore(  # type: ignore[call-arg]
torch.distributed.DistStoreError: Timed out after 901 seconds waiting for clients. 1/8 clients joined.
#!/bin/bash
# On AWS, the EFA and OFI paths enable NCCL to use optimized networking.
export LD_LIBRARY_PATH=/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda:/usr/local/cuda/targets/x86_64-linux/lib/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:$LD_LIBRARY_PATH
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
torchrun --nnodes=8 --node_rank=0 --nproc_per_node=8 \
    main_training_mamba.py \
    --tokenizer_path="/workspace/mnt/xxx/models/Bamba-9B" \
    --data_path="/workspace/mnt/cm-nfx/datasets/case7/no_preprocess" \
    --datasets="dataset=algorithmic_corpus,dataset=synthetic_code_snippet,dataset=synthetic_qa" \
    --weights="1,1,1" \
    --col_name="text" \
    --file_type="arrow" \
    --num_workers=12 \
    --seq_length=4096 \
    --vocab_size=128256 \
    --logical_shards=960 \
    --ckpt_load_path="/workspace/mnt/xxx/models/Bamba-9B" \
    --ckpt_save_path="/workspace/mnt/xxx/ckpt/bamba-fms" \
    --sharding_strategy="fsdp" \
    --batch_size=2 \
    --learning_rate=3e-4 \
    --num_steps=1000 \
    --report_interval=10 \
    --checkpoint_interval=10 \
    --strip_tokens="" \
    --seed=2023 \
    --bos_token=None \
    --bol_token=None \
    --eol_token=None \
    --eos_token=0
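
The timeout in the traceback means the static TCP rendezvous on the --node_rank=0 host saw only 1 of the 8 expected nodes join within the roughly 900-second limit. With --nnodes=8, the same torchrun command has to be started on all eight hosts, each with its own --node_rank, and every host has to be pointed at the rank-0 node's rendezvous store via --master_addr/--master_port; the script above sets neither, and torchrun's default master address (127.0.0.1) only works for single-node runs. A minimal sketch of such a launch, where 10.0.0.1, port 29500, and the NODE_RANK variable are placeholder values rather than anything taken from the original script:

# Hypothetical sketch: run this on every one of the 8 nodes, changing only NODE_RANK per host.
export MASTER_ADDR=10.0.0.1    # placeholder: IP of the host that uses --node_rank=0
export MASTER_PORT=29500       # placeholder: any free TCP port reachable from all hosts

torchrun --nnodes=8 --nproc_per_node=8 \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    main_training_mamba.py \
    ...   # same training arguments as in the script above

If the other seven nodes were never started, or cannot reach that address and port (firewalls, security groups, wrong interface), the rank-0 node blocks in next_rendezvous() until it raises exactly the DistStoreError shown above ("1/8 clients joined").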