# Run all commands in one shell
.ONESHELL:
# Default target
.DEFAULT_GOAL := help
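# With .DEFAULT_GOAL set, running a bare `make` prints the help text below.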
.PHONY: help
## help: run 'make help' at the command line
help: Makefile
	@sed -n 's/^##//p' $<
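# The sed call prints every line of this Makefile that starts with '##', so each
# '## target: description' comment doubles as a help entry. Example:
#   make help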
.PHONY: list
## list: list all targets in the current make file
list:
	@LC_ALL=C $(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$'
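# This dumps make's internal database without running any rules (-pRrq), keeps
# target names between the '# File' and '# Finished Make data base' markers,
# and filters out special targets and the 'list' target itself. Example:
#   make list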
# Generic variables
USR := $(shell whoami | head -c 2)
DT := $(shell date +"%Y%m%d")
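# Illustrative values: USR holds the first two characters of the username
# (e.g. 'sa'), DT the current date (e.g. '20240215').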
# Notes: since accelerate is not working, the following values are recommended:
# | Model       | Batch | CPUs | Memory |
# | segformer   | 688   | 3    | 96     | (ignore this model going forward)
# | simple_unet | 1632  | 3    | 96     |
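# Example run with the recommended simple_unet settings (assuming resources are
# configured in submit.sh / submit_requeue.sh):
#   make tl-train model_name=simple_unet batch_sizes=1632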
# Training parameters
model_name = segformer
num_epochs = 100
# augment = 0
lrs = 0.0001
debug = 0
batch_sizes = 256
nr_of_classes = 50
data_size = med
# aug_flip = 0 1 2 3
log_images = 0
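# All of the above can be overridden on the command line; space-separated lists
# widen the grid searched by the loops below, e.g. (illustrative values):
#   make tl-train lrs="0.0001 0.001" batch_sizes="128 256"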
## tl-train-new: train a model from scratch (submits via submit_requeue.sh)
tl-train-new:
	for model in $(model_name); do \
		for batch_size in $(batch_sizes); do \
			for lr in $(lrs); do \
				logdir="/om2/scratch/Sat/sabeen/20240215-grid-M$$model\S$(data_size)\C$(nr_of_classes)\B$$batch_size\LR$$lr\A0"; \
				sbatch --job-name=$$logdir submit_requeue.sh \
					model_name=$$model \
					nr_of_classes=$(nr_of_classes) \
					logdir=$$logdir \
					num_epochs=$(num_epochs) \
					batch_size=$$batch_size \
					lr=$$lr \
					debug=$(debug) \
					log_images=$(log_images) \
					data_size=$(data_size); \
			done; \
		done; \
	done
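# Note: submit_requeue.sh receives the settings as key=value arguments; see
# tl-train below for the variant that invokes srun python directly. Example:
#   make tl-train-new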
## tl-train: train a model from scratch (submits via submit.sh and srun)
tl-train:
	for model in $(model_name); do \
		for batch_size in $(batch_sizes); do \
			for lr in $(lrs); do \
				logdir="/om2/scratch/Sat/sabeen/20240214-grid-M$$model\S$(data_size)\C$(nr_of_classes)\B$$batch_size\LR$$lr\A0"; \
				sbatch --job-name=$$logdir submit.sh srun python -u scripts/commands/main.py train \
					--model_name $$model \
					--nr_of_classes $(nr_of_classes) \
					--logdir $$logdir \
					--num_epochs $(num_epochs) \
					--batch_size $$batch_size \
					--lr $$lr \
					--debug $(debug) \
					--log_images $(log_images) \
					--data_size $(data_size); \
			done; \
		done; \
	done
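# Here submit.sh wraps the full `srun python ... train` command, so settings are
# passed as --flag value pairs. Example sweep (illustrative values):
#   make tl-train batch_sizes="256 512"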
## tl-resume: resume training from an existing log directory
tl-resume:
	for model in $(model_name); do \
		for loss in $(loss_type); do \
			logdir=test-M$$model\L$$loss\A$(augment); \
			sbatch --job-name=$$logdir --open-mode=append submit.sh python -u scripts/commands/main.py resume-train \
				/space/calico/1/users/Harsha/ddpm-labels/logs/$$logdir; \
		done; \
	done
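# Note: loss_type and augment are not defined above (augment is commented out),
# so they must be supplied on the command line, e.g. (illustrative values):
#   make tl-resume loss_type=dice augment=0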
## tl-test: test changes to code using fashion-mnist data
tl-test:
	python -u scripts/main.py train \
		--model_name segformer \
		--logdir mnist \
		--num_epochs 10 \
		--debug 1
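# Runs locally (no sbatch), so it is suited to quick sanity checks before
# submitting a full job:
#   make tl-test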
## model-summary: print model summary
model-summary:
	python TissueLabeling/models/segformer.py
	python TissueLabeling/models/original_unet.py
	python TissueLabeling/models/attention_unet.py
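# Each model module is expected to print its own architecture summary when run
# as a script:
#   make model-summary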