-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_ddp_var_bandwidth_ddp.sh
31 lines (29 loc) · 2.37 KB
/
run_ddp_var_bandwidth_ddp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/bin/bash
#
# Launch one node of a multi-node torch.distributed run of main_bert.py with
# a tbf rate limit installed on the network interface.
#
# Usage:
#   run_ddp_var_bandwidth_ddp.sh <master_ip> <rank> <bsize> \
#       <dataset_location> <log_file> <num_workers> <s3_prefix>
#
# Positional arguments (all forwarded to the launcher / main_bert.py below):
#   $1 master_ip         value for --master_addr
#   $2 rank              value for --node_rank
#   $3 bsize             value for --batch-size
#   $4 dataset_location  value for --dataset-location
#   $5 log_file          value for --log-file
#   $6 num_workers       value for --nnodes
#   $7 s3_prefix         base for --s3-prefix (a bandwidth suffix is appended)
master_ip=$1
rank=$2
bsize=$3
dataset_location=$4
log_file=$5
num_workers=$6
s3_prefix=$7

# Print the received arguments, one per line, so they show up in the job log.
# printf with a fixed format is used instead of echo so that every value is
# printed verbatim: no word splitting or globbing, and values that look like
# echo options (e.g. "-n") are not swallowed.
printf '%s\n' \
  "$master_ip" \
  "$rank" \
  "$bsize" \
  "$dataset_location" \
  "$log_file" \
  "$num_workers" \
  "$s3_prefix"
#./run.sh -arch resnet18 -master-ip tcp://127.0.0.1:2345 -rank 0 -reducer powersgd -bsize 28 -dataset-location /home/ubuntu -device cuda:0 -log-file temp -reducer powersgd -reducer-param 2
# Activate the conda environment that provides python / torch for this run.
source activate pytorch_latest_p37

# Bandwidth cap in Gbit/s. Kept in a single variable so the shaper rate and
# the "_<rate>gbps" suffix appended to the S3 prefix cannot drift apart.
rate_gbit=20

# Install a token-bucket filter as the root qdisc on ens3. `replace` is used
# instead of `add` because `add` fails with "File exists" when a root qdisc
# has already been configured explicitly (e.g. by a previous run of this
# script, which never removes it); `replace` creates or overwrites it.
sudo tc qdisc replace dev ens3 root tbf rate "${rate_gbit}gbit" latency 50ms burst 10MB

# Start 8 processes on this node and join the multi-node job. All expansions
# are quoted so paths or prefixes containing spaces or glob characters reach
# the launcher as single arguments. --node_rank is passed twice on purpose:
# once to the launcher and once to main_bert.py itself.
OMP_NUM_THREADS=8 python -m torch.distributed.launch \
  --nproc_per_node=8 \
  --nnodes="$num_workers" \
  --node_rank="$rank" \
  --master_addr="$master_ip" \
  --master_port=2345 \
  main_bert.py \
  --batch-size "$bsize" \
  --dataset-location "$dataset_location" \
  --log-file "$log_file" \
  --s3-prefix "${s3_prefix}_${rate_gbit}gbps" \
  --node_rank "$rank" \
  --max_seq_length 512
#sudo tc qdisc del dev ens3 root tbf rate 1gbit latency 50ms burst 10MB
#sudo tc qdisc add dev ens3 root tbf rate 2gbit latency 50ms burst 10MB
#OMP_NUM_THREADS=8 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=$num_workers --node_rank=$rank --master_addr=$master_ip --master_port=2345 main_bert.py --batch-size $bsize --dataset-location $dataset_location --log-file $log_file --s3-prefix "${s3_prefix}_2gbps" --node_rank $rank --max_seq_length 512
#sudo tc qdisc del dev ens3 root tbf rate 2gbit latency 50ms burst 10MB
#sudo tc qdisc add dev ens3 root tbf rate 4gbit latency 50ms burst 10MB
#OMP_NUM_THREADS=8 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=$num_workers --node_rank=$rank --master_addr=$master_ip --master_port=2345 main_bert.py --batch-size $bsize --dataset-location $dataset_location --log-file $log_file --s3-prefix "${s3_prefix}_4gbps" --node_rank $rank --max_seq_length 512
#sudo tc qdisc del dev ens3 root tbf rate 4gbit latency 50ms burst 10MB
#sudo tc qdisc add dev ens3 root tbf rate 8gbit latency 50ms burst 10MB
#OMP_NUM_THREADS=8 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=$num_workers --node_rank=$rank --master_addr=$master_ip --master_port=2345 main_bert.py --batch-size $bsize --dataset-location $dataset_location --log-file $log_file --s3-prefix "${s3_prefix}_8gbps" --node_rank $rank --max_seq_length 512
# python main.py --arch $arch --master-ip $2 --rank $3 --reducer $4 --batch-size $5 --dataset-location $6 --device cuda:1 --log-file $8 --reducer $9 --reducer-param $reducer_param