ecco_ocr_ec_lumi.sh
#!/bin/bash
#SBATCH --account=Project_462000587
##SBATCH --time=12:00:00
#SBATCH --time=2:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=56
#SBATCH --mem=0
#SBATCH --partition=dev-g
#SBATCH --gpus-per-task=mi250:8
#SBATCH --exclusive
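# Usage (inferred from the positional arguments parsed further down; the 4th argument is optional):
#   sbatch ecco_ocr_ec_lumi.sh <train.jsonl.gz> <eval.jsonl.gz> <output_dir> [<checkpoint_dir>]
# Three arguments start fine-tuning; a fourth argument runs inference/evaluation from that checkpoint.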
echo "Slurm job ID: $SLURM_JOB_ID"
echo "Slurm job nodes: $SLURM_JOB_NODELIST"
export NCCL_SOCKET_IFNAME=hsn
export LOCAL_SCRATCH=/tmp
module purge
# module load LUMI partition/container EasyBuild-user
# module load cray-python
# eb PyTorch-2.1.0-rocm-5.6.1-python-3.10-singularity-20231123.eb
# module load PyTorch/2.1.0-rocm-5.6.1-python-3.10-singularity-20231123
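# Use CSC's locally maintained module tree on LUMI; the pytorch module provides a
# ROCm-enabled PyTorch environment (container-based) without a manual EasyBuild install.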
module use /appl/local/csc/modulefiles
module load pytorch
# export PYTHONUSERBASE=/projappl/project_2000539/rastasii/ecco-ocr-ec/user-env
# pip install --user --ignore-installed pytorch-lightning
# pip install --user --upgrade pip
# pip install --user --upgrade pytorch-lightning
# pip install --user --ignore-installed lightning_utilities
# cat > $LOCAL_SCRATCH/make_env.sh << "EOF"
# python -m venv $LOCAL_SCRATCH/env
# source $LOCAL_SCRATCH/env/bin/activate
# pip install --upgrade pip setuptools wheel
# # pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
# # pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
# pip install torch==1.13.1+rocm5.2 torchvision==0.14.1+rocm5.2 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/rocm5.2
# pip install transformers
# pip install pytorch-lightning
# pip install torch_optimizer
# pip install deepspeed
# pip install evaluate
# pip install cer
# pip install jiwer
# pip install tensorboard
# pip install seaborn
# EOF
# srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash $LOCAL_SCRATCH/make_env.sh
# source $LOCAL_SCRATCH/env/bin/activate
# rm -r env_2
# bash make_env.sh
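# Activate the user virtual environment prepared earlier (see the commented-out make_env.sh block above).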
source env/bin/activate
# echo "SIF: $SIF"
# echo "RUNSCRIPTS: $RUNSCRIPTS"
# ls $RUNSCRIPTS
# export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "HIP_PATH: $HIP_PATH"
echo "HSA_PATH: $HSA_PATH"
echo "HIP_ROCCLR_HOME: $HIP_ROCCLR_HOME"
echo "HIP_CLANG_PATH: $HIP_CLANG_PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
export TMPDIR=$LOCAL_SCRATCH
# export PYTORCH_PRETRAINED_BERT_CACHE=$TMPDIR
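# Keep the PyTorch and Hugging Face caches on project scratch instead of the quota-limited home directory.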
export TORCH_HOME=/scratch/project_462000587/rastasii/cache
export HF_HOME=/scratch/project_462000587/rastasii/cache
# export HF_DATASETS_CACHE=$TMPDIR
# export TRANSFORMERS_CACHE=/scratch/project_462000347/rastasii/cache
# export OMP_NUM_THREADS=8
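# Run the `evaluate` library in offline mode so metric scripts are not fetched at runtime;
# copy_script.sh below copies the needed metric scripts locally.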
export HF_EVALUATE_OFFLINE=1
# nvcc --version
# rocminfo
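# Environment diagnostics: report the Python interpreter and the DeepSpeed, Transformers
# and Accelerate versions provided by the loaded environment before launching anything.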
which python
python -m deepspeed.env_report
pip show transformers
pip show accelerate
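# Keep these in sync with the #SBATCH --nodes and --gpus-per-task settings above.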
NNODES=1
NGPUS=8
TRAIN=$1 # A .jsonl.gz file containing the training samples as {'input', 'output'}.
EVAL=$2 # A .jsonl.gz file containing the evaluation samples, in the same format as the training samples.
OUTDIR=$3 # A directory to which fine-tuning checkpoints are saved.
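# Illustrative example of one line of the .jsonl.gz input (field contents are hypothetical):
#   {"input": "OCR output with errors", "output": "manually corrected text"}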
cat > $TMPDIR/copy_script.sh << "EOF"
#!/bin/bash
echo "Temporary directory: $1"
gzip -dc $2 > $1/train.jsonl
gzip -dc $3 > $1/eval.jsonl
# Copy the evaluation metrics for offline use.
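# (Local copies of the `character` and `wer` metric scripts from the Hugging Face
# `evaluate` repository; required because HF_EVALUATE_OFFLINE=1 disables downloads.)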
cp -r libs/evaluate/metrics/character .
cp -r libs/evaluate/metrics/wer .
echo "Copied training data to $TMPDIR"
EOF
chmod u+x $TMPDIR/copy_script.sh
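# Broadcast the copy script to the same path on every allocated node's local storage;
# the {,} brace expansion simply repeats the path as both source and destination for sbcast.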
sbcast $TMPDIR/copy_script.sh{,}
echo "Copying training data to NVMe."
SECONDS=0
# srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 gzip -dc $TRAIN | sponge $TMPDIR/train.jsonl
# srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 gzip -dc $EVAL | sponge $TMPDIR/eval.jsonl
# srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 ls -la $TMPDIR
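# Run the copy once per node so each node's local scratch gets its own decompressed copy of the data.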
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 $TMPDIR/copy_script.sh $TMPDIR $TRAIN $EVAL
duration=$SECONDS
echo "Train data copied to ${TMPDIR} in ${duration} seconds."
wc -l $TMPDIR/train.jsonl
wc -l $TMPDIR/eval.jsonl
# https://docs.csc.fi/support/tutorials/ml-multi/
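# With three arguments, fine-tune; with a fourth (a checkpoint directory), run inference/evaluation instead.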
if [ $# -eq 3 ]
then
# srun --label singularity exec -B "/appl:/appl" -B "$SCRATCH:$LOCAL_SCRATCH" $SIF $RUNSCRIPTS/conda-python-simple ecco_ocr_ec.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --out_dir $OUTDIR
srun --label python ecco_ocr_ec.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --out_dir $OUTDIR
elif [ $# -eq 4 ]
then
# python $4/zero_to_fp32.py $4 $4/pytorch_model.bin
# srun --label python ecco_ocr_ec.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --out_dir $OUTDIR --load_checkpoint $4
# scontrol show hostnames "$SLURM_JOB_NODELIST"
# export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# python -m accelerate.commands.launch --num_processes $(($NNODES * $NGPUS)) ecco_ocr_ec_eval.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --out_dir $OUTDIR --load_checkpoint $4
# python -m accelerate.commands.launch --num_processes $(($NNODES * $NGPUS)) ocr_ec_inference.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --predictions $TMPDIR/predictions.jsonl --references $TMPDIR/references.jsonl --load_checkpoint $4
# singularity_wrapper exec deepspeed ocr_ec_inference.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --predictions $TMPDIR/predictions.jsonl --references $TMPDIR/references.jsonl --load_checkpoint $4
# python extract_ckpt_from_pl.py --ds_checkpoint $4 --model_path $4
srun --label python ocr_ec_inference.py --nodes $NNODES --gpus $NGPUS --train $TMPDIR/train.jsonl --eval $TMPDIR/eval.jsonl --predictions $TMPDIR/predictions.jsonl --references $TMPDIR/references.jsonl --load_checkpoint $4
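# Compute the final evaluation metrics on the written predictions/references
# (eval_metrics.py comes from the sibling ocr-postcorrection-lm repository).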
python ../ocr-postcorrection-lm/evaluation/eval_metrics.py -p $TMPDIR/predictions.jsonl -r $TMPDIR/references.jsonl -m all
fi