-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresume_multi_gpu.sh
31 lines (27 loc) · 1021 Bytes
/
resume_multi_gpu.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/bin/bash
# Slurm batch script: resume a multi-GPU training run (4 tasks and 4 A100 GPUs
# on a single node) by calling `scripts/commands/main.py resume-train`.
#
# to run:
# sbatch --export=ALL,wandb_description='testrun' jobs/job.sh
#
# The log paths and the path to main.py are relative, so submit from the
# directory that contains both `logs/` and `scripts/`. The `logs/` directory
# must exist before submission; Slurm does not create it.
#
#SBATCH -t 12:00:00
#SBATCH -N 1
#SBATCH -c 4
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:a100:4
#SBATCH --mem=40G # per node memory
#SBATCH -p multi-gpu
#SBATCH -o ./logs/med-50seg-multi-seg-resume-2.out
#SBATCH -e ./logs/med-50seg-multi-seg-resume-2.err
#SBATCH --mail-type=FAIL
# The recipient directive that used to sit here had lost its `--mail-user=`
# option name and could not be parsed by sbatch. It is disabled below, so
# failure mail goes to the submitting user (Slurm's default). To send it
# elsewhere, fill in an address and remove the space after '#'.
# SBATCH --mail-user=<address>

# Abort on the first failing command, on unset variables, and on failures
# inside pipelines. Must come after the #SBATCH block: sbatch stops reading
# directives at the first executable line.
set -euo pipefail

# Put the miniconda install first on PATH so its executables are found first.
# `conda init` is intentionally not run here: it only edits shell startup
# files and does not change the environment of this already-running job.
export PATH="/om2/user/sabeen/miniconda/bin:${PATH}"

# Run directory to resume. Single quotes keep the backslashes literal.
readonly LOGDIR='20240204-multi-4gpu-Msegformer\Smed\Ldice\C51\B512\A0'
# readonly LOGDIR='20240204-multi-4gpu-Msimple_unet\Smed\Ldice\C51\B352\A0'

# -u ensures that the output is unbuffered, and written immediately to stdout.
# 24 batch size per A100 GPU
# For multi GPU training
srun python -u scripts/commands/main.py resume-train --logdir="${LOGDIR}"

# Alternative resource requests, kept for reference (disabled by the space
# after '#'):
# SBATCH -p multi-gpu
# SBATCH --constraint=high-capacity
# SBATCH --gres=gpu:a100:1
# SBATCH --constraint=any-A100
# SBATCH --constraint=high-capacity
# SBATCH --gres=gpu:1