From 792f6b99a2b26200ade982c624989b00ce612e04 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Wed, 13 Nov 2024 11:41:30 +0100 Subject: [PATCH 01/21] Hardcode the warmup process and the weight initialization --- luxonis_train/config/config.py | 2 + luxonis_train/models/luxonis_lightning.py | 121 +++++++++++++++++- .../backbones/efficientrep/efficientrep.py | 21 +++ luxonis_train/nodes/blocks/blocks.py | 12 ++ .../nodes/heads/efficient_bbox_head.py | 13 ++ .../nodes/necks/reppan_neck/reppan_neck.py | 13 ++ 6 files changed, 177 insertions(+), 5 deletions(-) diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 67fdc8f0..7ed0c573 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -328,6 +328,8 @@ class CallbackConfig(BaseModelExtraForbid): class OptimizerConfig(BaseModelExtraForbid): name: str = "Adam" + apply_custom_lr: bool = False + warmup_epochs: int = 0 params: Params = {} diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 011c3983..600885bf 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -1,3 +1,4 @@ +import math from collections import defaultdict from collections.abc import Mapping from logging import getLogger @@ -5,6 +6,7 @@ from typing import Literal, cast import lightning.pytorch as pl +import numpy as np import torch from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary from lightning.pytorch.utilities import rank_zero_only # type: ignore @@ -131,6 +133,15 @@ def __init__( self._core = _core self.cfg = cfg + ## + self.max_stepnum = math.ceil( + len(self._core.loaders["train"]) / cfg.trainer.batch_size + ) + self.warmup_stepnum = max( + round(cfg.trainer.optimizer.warmup_epochs * self.max_stepnum), 1000 + ) + self.step = 0 + ## self.original_in_shapes = input_shapes self.image_source = cfg.loader.image_source self.dataset_metadata = dataset_metadata or DatasetMetadata() @@ -857,14 +868,66 @@ def configure_optimizers( list[torch.optim.Optimizer], list[torch.optim.lr_scheduler.LRScheduler], ]: - """Configures model optimizers and schedulers.""" + """Configures model optimizers and schedulers with optional + custom learning rates and warm-up logic.""" + cfg_optimizer = self.cfg.trainer.optimizer cfg_scheduler = self.cfg.trainer.scheduler - optim_params = cfg_optimizer.params | { - "params": filter(lambda p: p.requires_grad, self.parameters()), - } - optimizer = OPTIMIZERS.get(cfg_optimizer.name)(**optim_params) + apply_custom_lr = cfg_optimizer.apply_custom_lr + + if apply_custom_lr: + g_bnw, g_w, g_b = [], [], [] + for v in self.modules(): + if hasattr(v, "bias") and isinstance( + v.bias, torch.nn.Parameter + ): + g_b.append(v.bias) + if isinstance(v, torch.nn.BatchNorm2d): + g_bnw.append(v.weight) + elif hasattr(v, "weight") and isinstance( + v.weight, torch.nn.Parameter + ): + g_w.append(v.weight) + + # Create the optimizer with parameter groups + assert cfg_optimizer.name in [ + "SGD", + "Adam", + ], "ERROR: unknown optimizer, use SGD or Adam" + optimizer = torch.optim.SGD( + g_bnw, + lr=cfg_optimizer.params["lr"], + momentum=cfg_optimizer.params["momentum"], + nesterov=True, + ) + + optimizer.add_param_group( + { + "params": g_w, + "weight_decay": cfg_optimizer.params["weight_decay"], + } + ) + optimizer.add_param_group({"params": g_b}) + + lrf = 0.01 + self.lf = ( + lambda x: ( + (1 - math.cos(x * math.pi / self.cfg.trainer.epochs)) / 2 + ) + * (lrf - 1) + + 1 + ) + scheduler = 
torch.optim.lr_scheduler.LambdaLR( + optimizer, lr_lambda=self.lf + ) + return [optimizer], [scheduler] + + else: + optim_params = cfg_optimizer.params | { + "params": filter(lambda p: p.requires_grad, self.parameters()), + } + optimizer = OPTIMIZERS.get(cfg_optimizer.name)(**optim_params) def get_scheduler(scheduler_cfg, optimizer): scheduler_class = SCHEDULERS.get( @@ -895,6 +958,54 @@ def get_scheduler(scheduler_cfg, optimizer): return [optimizer], [scheduler] + def on_after_backward(self): + """Custom logic to adjust learning rates and momentum after + loss.backward.""" + # Call your custom logic here + self.custom_logic() + + def custom_logic(self): + """Custom logic to adjust learning rates and momentum after + loss.backward.""" + + # Increment step counter + self.step = ( + self.step % self.max_stepnum + ) # Reset step counter after each epoch + curr_step = self.step + self.max_stepnum * self.current_epoch + + # Warm-up phase adjustments + if curr_step <= self.warmup_stepnum: + optimizer = self.optimizers() + for k, param in enumerate(optimizer.param_groups): + warmup_bias_lr = ( + self.cfg.trainer.optimizer.params["warmup_bias_lr"] + if k == 2 + else 0.0 + ) + param["lr"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + warmup_bias_lr, + self.cfg.trainer.optimizer.params["lr"] + * self.lf(self.current_epoch), + ], + ) + if "momentum" in param: + param["momentum"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + self.cfg.trainer.optimizer.params[ + "warmup_momentum" + ], + self.cfg.trainer.optimizer.params["momentum"], + ], + ) + + self.step += 1 + def load_checkpoint(self, path: str | Path | None) -> None: """Loads checkpoint weights from provided path. diff --git a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py index d094da14..4357722d 100644 --- a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py +++ b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py @@ -125,9 +125,22 @@ def __init__( ) ) + self.initialize_weights() + if download_weights and var.weights_path: self.load_checkpoint(var.weights_path) + def initialize_weights(self): + for m in self.modules(): + t = type(m) + if t is nn.Conv2d: + pass + elif t is nn.BatchNorm2d: + m.eps = 1e-3 + m.momentum = 0.03 + elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + m.inplace = True + def set_export_mode(self, mode: bool = True) -> None: """Reparametrizes instances of L{RepVGGBlock} in the network. 
@@ -143,6 +156,14 @@ def set_export_mode(self, mode: bool = True) -> None: module.reparametrize() def forward(self, inputs: Tensor) -> list[Tensor]: + # # Lets plot the input + # img_plt = inputs[0].cpu().numpy().transpose(1, 2, 0) + # # it was normalised with /255.0 so we have to denormalise it + # img_plt = img_plt * 255.0 + # import matplotlib.pyplot as plt + # plt.imshow(img_plt.astype(int)) + # plt.show(block=True) + outputs: list[Tensor] = [] x = self.repvgg_encoder(inputs) for block in self.blocks: diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index 25bea7c5..2ac01805 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -56,6 +56,18 @@ def __init__(self, n_classes: int, in_channels: int): prior_prob = 1e-2 self._initialize_weights_and_biases(prior_prob) + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + t = type(m) + if t is nn.Conv2d: + pass + elif t is nn.BatchNorm2d: + m.eps = 1e-3 + m.momentum = 0.03 + elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + m.inplace = True def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]: out_feature = self.decoder(x) diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index c8500915..1394c8fa 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -95,12 +95,25 @@ def __init__( f"output{i+1}_yolov6r2" for i in range(self.n_heads) ] + self.initialize_weights() + if download_weights: # TODO: Handle variants of head in a nicer way if self.in_channels == [32, 64, 128]: weights_path = "https://github.com/luxonis/luxonis-train/releases/download/v0.1.0-beta/efficientbbox_head_n_coco.ckpt" self.load_checkpoint(weights_path, strict=False) + def initialize_weights(self): + for m in self.modules(): + t = type(m) + if t is nn.Conv2d: + pass + elif t is nn.BatchNorm2d: + m.eps = 1e-3 + m.momentum = 0.03 + elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + m.inplace = True + def forward( self, inputs: list[Tensor] ) -> tuple[list[Tensor], list[Tensor], list[Tensor]]: diff --git a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py index e6b321be..73908e12 100644 --- a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py +++ b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py @@ -165,9 +165,22 @@ def __init__( out_channels = channels_list_down_blocks[2 * i + 1] curr_n_repeats = n_repeats_down_blocks[i] + self.initialize_weights() + if download_weights and var.weights_path: self.load_checkpoint(var.weights_path) + def initialize_weights(self): + for m in self.modules(): + t = type(m) + if t is nn.Conv2d: + pass + elif t is nn.BatchNorm2d: + m.eps = 1e-3 + m.momentum = 0.03 + elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + m.inplace = True + def forward(self, inputs: list[Tensor]) -> list[Tensor]: x = inputs[-1] up_block_outs: list[Tensor] = [] From 08f219ecdfb1923ec064b415c4c031be9f303a31 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 13:35:38 +0100 Subject: [PATCH 02/21] custom SGD idea --- luxonis_train/models/luxonis_lightning.py | 79 ++++++++++++----------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 600885bf..fd91691e 100644 --- 
a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -133,15 +133,6 @@ def __init__( self._core = _core self.cfg = cfg - ## - self.max_stepnum = math.ceil( - len(self._core.loaders["train"]) / cfg.trainer.batch_size - ) - self.warmup_stepnum = max( - round(cfg.trainer.optimizer.warmup_epochs * self.max_stepnum), 1000 - ) - self.step = 0 - ## self.original_in_shapes = input_shapes self.image_source = cfg.loader.image_source self.dataset_metadata = dataset_metadata or DatasetMetadata() @@ -877,40 +868,56 @@ def configure_optimizers( apply_custom_lr = cfg_optimizer.apply_custom_lr if apply_custom_lr: - g_bnw, g_w, g_b = [], [], [] - for v in self.modules(): - if hasattr(v, "bias") and isinstance( - v.bias, torch.nn.Parameter + assert cfg_optimizer.name == "SGD", ( + "Custom learning rates are supported only for SGD optimizer. " + f"Got {cfg_optimizer.name}." + ) + self.max_stepnum = math.ceil( + len(self._core.loaders["train"]) / self.cfg.trainer.batch_size + ) + self.warmup_stepnum = max( + round( + self.cfg.trainer.optimizer.warmup_epochs * self.max_stepnum + ), + 1000, + ) + self.step = 0 + batch_norm_weights, regular_weights, biases = [], [], [] + for module in self.modules(): + if hasattr(module, "bias") and isinstance( + module.bias, torch.nn.Parameter ): - g_b.append(v.bias) - if isinstance(v, torch.nn.BatchNorm2d): - g_bnw.append(v.weight) - elif hasattr(v, "weight") and isinstance( - v.weight, torch.nn.Parameter + biases.append(module.bias) + if isinstance(module, torch.nn.BatchNorm2d): + batch_norm_weights.append(module.weight) + elif hasattr(module, "weight") and isinstance( + module.weight, torch.nn.Parameter ): - g_w.append(v.weight) + regular_weights.append(module.weight) - # Create the optimizer with parameter groups - assert cfg_optimizer.name in [ - "SGD", - "Adam", - ], "ERROR: unknown optimizer, use SGD or Adam" optimizer = torch.optim.SGD( - g_bnw, + [ + { + "params": batch_norm_weights, + "lr": cfg_optimizer.params["lr"], + "momentum": cfg_optimizer.params["momentum"], + "nesterov": True, + }, + { + "params": regular_weights, + "weight_decay": cfg_optimizer.params["weight_decay"], + }, + {"params": biases}, + ], lr=cfg_optimizer.params["lr"], momentum=cfg_optimizer.params["momentum"], - nesterov=True, + nesterov=cfg_optimizer.params["nesterov"], ) - optimizer.add_param_group( - { - "params": g_w, - "weight_decay": cfg_optimizer.params["weight_decay"], - } + lrf = ( + self.cfg.trainer.optimizer.params["lre"] + / self.cfg.trainer.optimizer.params["lr"] ) - optimizer.add_param_group({"params": g_b}) - - lrf = 0.01 self.lf = ( lambda x: ( (1 - math.cos(x * math.pi / self.cfg.trainer.epochs)) / 2 @@ -961,8 +968,8 @@ def get_scheduler(scheduler_cfg, optimizer): def on_after_backward(self): """Custom logic to adjust learning rates and momentum after loss.backward.""" - # Call your custom logic here - self.custom_logic() + if self.cfg.trainer.optimizer.apply_custom_lr: + self.custom_logic() def custom_logic(self): """Custom logic to adjust learning rates and momentum after From dfea01d0cf28ccecfe1cc29354ace8cd72a62d2f Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 13:43:23 +0100 Subject: [PATCH 03/21] moving warmup_epochs to optim params --- luxonis_train/models/luxonis_lightning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index fd91691e..bd363153 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ 
b/luxonis_train/models/luxonis_lightning.py @@ -877,7 +877,8 @@ def configure_optimizers( ) self.warmup_stepnum = max( round( - self.cfg.trainer.optimizer.warmup_epochs * self.max_stepnum + self.cfg.trainer.optimizer.params["warmup_epochs"] + * self.max_stepnum ), 1000, ) From 58f7f3fdb526d47f3cf6c2e81b941ad99cb9de72 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 13:47:10 +0100 Subject: [PATCH 04/21] remove warmup_epochs from config.py --- luxonis_train/config/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 7ed0c573..05480319 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -329,7 +329,6 @@ class CallbackConfig(BaseModelExtraForbid): class OptimizerConfig(BaseModelExtraForbid): name: str = "Adam" apply_custom_lr: bool = False - warmup_epochs: int = 0 params: Params = {} From 5c3a3a73df0bf1550039869eb25614fd7a141f44 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 13:48:32 +0100 Subject: [PATCH 05/21] remove comments --- .../nodes/backbones/efficientrep/efficientrep.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py index 4357722d..34e21020 100644 --- a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py +++ b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py @@ -156,14 +156,6 @@ def set_export_mode(self, mode: bool = True) -> None: module.reparametrize() def forward(self, inputs: Tensor) -> list[Tensor]: - # # Lets plot the input - # img_plt = inputs[0].cpu().numpy().transpose(1, 2, 0) - # # it was normalised with /255.0 so we have to denormalise it - # img_plt = img_plt * 255.0 - # import matplotlib.pyplot as plt - # plt.imshow(img_plt.astype(int)) - # plt.show(block=True) - outputs: list[Tensor] = [] x = self.repvgg_encoder(inputs) for block in self.blocks: From cefe3a6946dc88f5b75c7c4c92985d0c4a27892c Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 14:46:29 +0100 Subject: [PATCH 06/21] Custom scheduler and optimizer classes --- luxonis_train/config/config.py | 2 +- luxonis_train/models/luxonis_lightning.py | 125 +++--------------- luxonis_train/optimizers/custom_optimizers.py | 51 +++++++ luxonis_train/optimizers/optimizers.py | 3 + luxonis_train/schedulers/custom_schedulers.py | 79 +++++++++++ luxonis_train/schedulers/schedulers.py | 3 + 6 files changed, 159 insertions(+), 104 deletions(-) create mode 100644 luxonis_train/optimizers/custom_optimizers.py create mode 100644 luxonis_train/schedulers/custom_schedulers.py diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 05480319..2a7f9d3b 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -328,7 +328,6 @@ class CallbackConfig(BaseModelExtraForbid): class OptimizerConfig(BaseModelExtraForbid): name: str = "Adam" - apply_custom_lr: bool = False params: Params = {} @@ -356,6 +355,7 @@ class TrainerConfig(BaseModelExtraForbid): profiler: Literal["simple", "advanced"] | None = None matmul_precision: Literal["medium", "high", "highest"] | None = None verbose: bool = True + apply_custom_lr: bool = False seed: int | None = None n_validation_batches: PositiveInt | None = None diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index bd363153..2e5f775f 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ 
b/luxonis_train/models/luxonis_lightning.py @@ -6,7 +6,6 @@ from typing import Literal, cast import lightning.pytorch as pl -import numpy as np import torch from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary from lightning.pytorch.utilities import rank_zero_only # type: ignore @@ -865,70 +864,32 @@ def configure_optimizers( cfg_optimizer = self.cfg.trainer.optimizer cfg_scheduler = self.cfg.trainer.scheduler - apply_custom_lr = cfg_optimizer.apply_custom_lr + if self.cfg.trainer.apply_custom_lr: + assert ( + cfg_optimizer.name == "TripleLRSGD" + ), "Custom learning rate is only supported for TripleLRSGD optimizer." + assert ( + cfg_scheduler.name == "TripleLRScheduler" + ), "Custom learning rate is only supported for TripleLRScheduler scheduler." - if apply_custom_lr: - assert cfg_optimizer.name == "SGD", ( - "Custom learning rates are supported only for SGD optimizer. " - f"Got {cfg_optimizer.name}." - ) - self.max_stepnum = math.ceil( + max_stepnum = math.ceil( len(self._core.loaders["train"]) / self.cfg.trainer.batch_size ) - self.warmup_stepnum = max( - round( - self.cfg.trainer.optimizer.params["warmup_epochs"] - * self.max_stepnum - ), - 1000, - ) - self.step = 0 - batch_norm_weights, regular_weights, biases = [], [], [] - for module in self.modules(): - if hasattr(module, "bias") and isinstance( - module.bias, torch.nn.Parameter - ): - biases.append(module.bias) - if isinstance(module, torch.nn.BatchNorm2d): - batch_norm_weights.append(module.weight) - elif hasattr(module, "weight") and isinstance( - module.weight, torch.nn.Parameter - ): - regular_weights.append(module.weight) - - optimizer = torch.optim.SGD( - [ - { - "params": batch_norm_weights, - "lr": cfg_optimizer.params["lr"], - "momentum": cfg_optimizer.params["momentum"], - "nesterov": True, - }, - { - "params": regular_weights, - "weight_decay": cfg_optimizer.params["weight_decay"], - }, - {"params": biases}, - ], - lr=cfg_optimizer.params["lr"], - momentum=cfg_optimizer.params["momentum"], - nesterov=cfg_optimizer.params["nesterov"], + custom_optimizer = OPTIMIZERS.get(cfg_optimizer.name)( + self, cfg_optimizer.params ) + optimizer = custom_optimizer.create_optimizer() - lrf = ( - self.cfg.trainer.optimizer.params["lre"] - / self.cfg.trainer.optimizer.params["lr"] - ) - self.lf = ( - lambda x: ( - (1 - math.cos(x * math.pi / self.cfg.trainer.epochs)) / 2 - ) - * (lrf - 1) - + 1 - ) - scheduler = torch.optim.lr_scheduler.LambdaLR( - optimizer, lr_lambda=self.lf + custom_scheduler = SCHEDULERS.get(cfg_scheduler.name)( + optimizer, + cfg_scheduler.params, + self.cfg.trainer.epochs, + max_stepnum, ) + scheduler = custom_scheduler.create_scheduler() + + self.custom_scheduler = custom_scheduler + return [optimizer], [scheduler] else: @@ -969,50 +930,8 @@ def get_scheduler(scheduler_cfg, optimizer): def on_after_backward(self): """Custom logic to adjust learning rates and momentum after loss.backward.""" - if self.cfg.trainer.optimizer.apply_custom_lr: - self.custom_logic() - - def custom_logic(self): - """Custom logic to adjust learning rates and momentum after - loss.backward.""" - - # Increment step counter - self.step = ( - self.step % self.max_stepnum - ) # Reset step counter after each epoch - curr_step = self.step + self.max_stepnum * self.current_epoch - - # Warm-up phase adjustments - if curr_step <= self.warmup_stepnum: - optimizer = self.optimizers() - for k, param in enumerate(optimizer.param_groups): - warmup_bias_lr = ( - self.cfg.trainer.optimizer.params["warmup_bias_lr"] - if k == 2 - else 
0.0 - ) - param["lr"] = np.interp( - curr_step, - [0, self.warmup_stepnum], - [ - warmup_bias_lr, - self.cfg.trainer.optimizer.params["lr"] - * self.lf(self.current_epoch), - ], - ) - if "momentum" in param: - param["momentum"] = np.interp( - curr_step, - [0, self.warmup_stepnum], - [ - self.cfg.trainer.optimizer.params[ - "warmup_momentum" - ], - self.cfg.trainer.optimizer.params["momentum"], - ], - ) - - self.step += 1 + if self.cfg.trainer.apply_custom_lr: + self.custom_scheduler.update_learning_rate(self.current_epoch) def load_checkpoint(self, path: str | Path | None) -> None: """Loads checkpoint weights from provided path. diff --git a/luxonis_train/optimizers/custom_optimizers.py b/luxonis_train/optimizers/custom_optimizers.py new file mode 100644 index 00000000..e000343f --- /dev/null +++ b/luxonis_train/optimizers/custom_optimizers.py @@ -0,0 +1,51 @@ +import torch + + +class TripleLRSGD: + def __init__(self, model: torch.nn.Module, params: dict) -> None: + """TripleLRSGD is a custom optimizer that separates weights into + batch norm weights, regular weights, and biases. + + @type model: torch.nn.Module + @param model: The model to be used + @type params: dict + @param params: The parameters to be used for the optimizer + """ + self.model = model + self.params = params + + def create_optimizer(self): + batch_norm_weights, regular_weights, biases = [], [], [] + + for module in self.model.modules(): + if hasattr(module, "bias") and isinstance( + module.bias, torch.nn.Parameter + ): + biases.append(module.bias) + if isinstance(module, torch.nn.BatchNorm2d): + batch_norm_weights.append(module.weight) + elif hasattr(module, "weight") and isinstance( + module.weight, torch.nn.Parameter + ): + regular_weights.append(module.weight) + + optimizer = torch.optim.SGD( + [ + { + "params": batch_norm_weights, + "lr": self.params["lr"], + "momentum": self.params["momentum"], + "nesterov": self.params["nesterov"], + }, + { + "params": regular_weights, + "weight_decay": self.params["weight_decay"], + }, + {"params": biases}, + ], + lr=self.params["lr"], + momentum=self.params["momentum"], + nesterov=self.params["nesterov"], + ) + + return optimizer diff --git a/luxonis_train/optimizers/optimizers.py b/luxonis_train/optimizers/optimizers.py index c2a4bf12..43ca80ff 100644 --- a/luxonis_train/optimizers/optimizers.py +++ b/luxonis_train/optimizers/optimizers.py @@ -2,6 +2,8 @@ from luxonis_train.utils.registry import OPTIMIZERS +from .custom_optimizers import TripleLRSGD + for optimizer in [ optim.Adadelta, optim.Adagrad, @@ -15,5 +17,6 @@ optim.RAdam, optim.RMSprop, optim.SGD, + TripleLRSGD, ]: OPTIMIZERS.register_module(module=optimizer) diff --git a/luxonis_train/schedulers/custom_schedulers.py b/luxonis_train/schedulers/custom_schedulers.py new file mode 100644 index 00000000..12719322 --- /dev/null +++ b/luxonis_train/schedulers/custom_schedulers.py @@ -0,0 +1,79 @@ +import math + +import numpy as np +import torch + + +class TripleLRScheduler: + def __init__( + self, + optimizer: torch.optim.Optimizer, + params: dict, + epochs: int, + max_stepnum: int, + ) -> None: + """TripleLRScheduler is a custom learning rate scheduler that + combines a cosine annealing. 
+ + @type optimizer: torch.optim.Optimizer + @param optimizer: The optimizer to be used + @type parmas: dict + @param parmas: The parameters to be used for the scheduler + @type epochs: int + @param epochs: The number of epochs to train for + @type max_stepnum: int + @param max_stepnum: The maximum number of steps to train for + """ + self.optimizer = optimizer + self.params = params + self.max_stepnum = max_stepnum + self.warmup_stepnum = max( + round(self.params["warmup_epochs"] * self.max_stepnum), 1000 + ) + self.step = 0 + self.lrf = self.params["lre"] / self.optimizer.defaults["lr"] + self.lf = ( + lambda x: ((1 - math.cos(x * math.pi / epochs)) / 2) + * (self.lrf - 1) + + 1 + ) + + def create_scheduler(self): + scheduler = torch.optim.lr_scheduler.LambdaLR( + self.optimizer, lr_lambda=self.lf + ) + return scheduler + + def update_learning_rate(self, current_epoch: int) -> None: + """Update the learning rate based on the current epoch. + + @type current_epoch: int + @param current_epoch: The current epoch + """ + self.step = self.step % self.max_stepnum + curr_step = self.step + self.max_stepnum * current_epoch + + if curr_step <= self.warmup_stepnum: + for k, param in enumerate(self.optimizer.param_groups): + warmup_bias_lr = ( + self.params["warmup_bias_lr"] if k == 2 else 0.0 + ) + param["lr"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + warmup_bias_lr, + self.optimizer.defaults["lr"] * self.lf(current_epoch), + ], + ) + if "momentum" in param: + self.optimizer.defaults["momentum"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + self.params["warmup_momentum"], + self.optimizer.defaults["momentum"], + ], + ) + + self.step += 1 diff --git a/luxonis_train/schedulers/schedulers.py b/luxonis_train/schedulers/schedulers.py index 488a7498..12f184e8 100644 --- a/luxonis_train/schedulers/schedulers.py +++ b/luxonis_train/schedulers/schedulers.py @@ -2,6 +2,8 @@ from luxonis_train.utils.registry import SCHEDULERS +from .custom_schedulers import TripleLRScheduler + for scheduler in [ lr_scheduler.LambdaLR, lr_scheduler.MultiplicativeLR, @@ -18,5 +20,6 @@ lr_scheduler.CyclicLR, lr_scheduler.OneCycleLR, lr_scheduler.CosineAnnealingWarmRestarts, + TripleLRScheduler, ]: SCHEDULERS.register_module(module=scheduler) From 00a2bd3f6eb1c398884ec4950d6bebbb072e3503 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Thu, 14 Nov 2024 15:02:05 +0100 Subject: [PATCH 07/21] default params --- luxonis_train/optimizers/custom_optimizers.py | 9 ++++++++- luxonis_train/schedulers/custom_schedulers.py | 13 ++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/luxonis_train/optimizers/custom_optimizers.py b/luxonis_train/optimizers/custom_optimizers.py index e000343f..f8e28bdb 100644 --- a/luxonis_train/optimizers/custom_optimizers.py +++ b/luxonis_train/optimizers/custom_optimizers.py @@ -12,7 +12,14 @@ def __init__(self, model: torch.nn.Module, params: dict) -> None: @param params: The parameters to be used for the optimizer """ self.model = model - self.params = params + self.params = { + "lr": 0.02, + "momentum": 0.937, + "weight_decay": 0.0005, + "nesterov": True, + } + if params: + self.params.update(params) def create_optimizer(self): batch_norm_weights, regular_weights, biases = [], [], [] diff --git a/luxonis_train/schedulers/custom_schedulers.py b/luxonis_train/schedulers/custom_schedulers.py index 12719322..f0736b06 100644 --- a/luxonis_train/schedulers/custom_schedulers.py +++ b/luxonis_train/schedulers/custom_schedulers.py @@ -24,8 +24,19 @@ def 
__init__( @type max_stepnum: int @param max_stepnum: The maximum number of steps to train for """ + if optimizer.__class__.__name__ != "SGD": + raise ValueError( + "TripleLRScheduler can only be used with the 'SGD' optimizer." + ) self.optimizer = optimizer - self.params = params + self.params = { + "warmup_epochs": 3, + "warmup_bias_lr": 0.1, + "warmup_momentum": 0.8, + "lre": 0.0002, + } + if params: + self.params.update(params) self.max_stepnum = max_stepnum self.warmup_stepnum = max( round(self.params["warmup_epochs"] * self.max_stepnum), 1000 From 147c1eef54aea5dbae21f883fc66cb71c35271b0 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Fri, 15 Nov 2024 13:03:30 +0100 Subject: [PATCH 08/21] code refactor; added training strategies --- luxonis_train/__init__.py | 1 + luxonis_train/callbacks/__init__.py | 3 + luxonis_train/callbacks/training_manager.py | 28 +++ luxonis_train/config/config.py | 7 +- luxonis_train/models/luxonis_lightning.py | 69 +++---- .../backbones/efficientrep/efficientrep.py | 11 +- luxonis_train/nodes/blocks/blocks.py | 11 +- .../nodes/heads/efficient_bbox_head.py | 11 +- .../nodes/necks/reppan_neck/reppan_neck.py | 11 +- luxonis_train/optimizers/custom_optimizers.py | 58 ------ luxonis_train/optimizers/optimizers.py | 3 - luxonis_train/schedulers/custom_schedulers.py | 90 --------- luxonis_train/schedulers/schedulers.py | 3 - luxonis_train/strategies/__init__.py | 5 + luxonis_train/strategies/base_strategy.py | 27 +++ luxonis_train/strategies/triple_lr_sgd.py | 172 ++++++++++++++++++ luxonis_train/utils/registry.py | 5 + 17 files changed, 297 insertions(+), 218 deletions(-) create mode 100644 luxonis_train/callbacks/training_manager.py delete mode 100644 luxonis_train/optimizers/custom_optimizers.py delete mode 100644 luxonis_train/schedulers/custom_schedulers.py create mode 100644 luxonis_train/strategies/__init__.py create mode 100644 luxonis_train/strategies/base_strategy.py create mode 100644 luxonis_train/strategies/triple_lr_sgd.py diff --git a/luxonis_train/__init__.py b/luxonis_train/__init__.py index ac6e38a1..e9651769 100644 --- a/luxonis_train/__init__.py +++ b/luxonis_train/__init__.py @@ -10,6 +10,7 @@ from .nodes import * from .optimizers import * from .schedulers import * + from .strategies import * from .utils import * except ImportError as e: warnings.warn( diff --git a/luxonis_train/callbacks/__init__.py b/luxonis_train/callbacks/__init__.py index a3cf907c..7bea71a9 100644 --- a/luxonis_train/callbacks/__init__.py +++ b/luxonis_train/callbacks/__init__.py @@ -25,6 +25,7 @@ from .metadata_logger import MetadataLogger from .module_freezer import ModuleFreezer from .test_on_train_end import TestOnTrainEnd +from .training_manager import TrainingManager from .upload_checkpoint import UploadCheckpoint CALLBACKS.register_module(module=EarlyStopping) @@ -38,6 +39,7 @@ CALLBACKS.register_module(module=ModelPruning) CALLBACKS.register_module(module=GradCamCallback) CALLBACKS.register_module(module=EMACallback) +CALLBACKS.register_module(module=TrainingManager) __all__ = [ @@ -53,4 +55,5 @@ "GPUStatsMonitor", "GradCamCallback", "EMACallback", + "TrainingManager", ] diff --git a/luxonis_train/callbacks/training_manager.py b/luxonis_train/callbacks/training_manager.py new file mode 100644 index 00000000..9131fa84 --- /dev/null +++ b/luxonis_train/callbacks/training_manager.py @@ -0,0 +1,28 @@ +import pytorch_lightning as pl + +from luxonis_train.strategies.base_strategy import BaseTrainingStrategy + + +class TrainingManager(pl.Callback): + def __init__(self, 
strategy: BaseTrainingStrategy | None = None): + """Training manager callback that updates the parameters of the + training strategy. + + @type strategy: BaseTrainingStrategy + @param strategy: The strategy to be used. + """ + self.strategy = strategy + + def on_after_backward( + self, trainer: pl.Trainer, pl_module: pl.LightningModule + ): + """PyTorch Lightning hook that is called after the backward + pass. + + @type trainer: pl.Trainer + @param trainer: The trainer object. + @type pl_module: pl.LightningModule + @param pl_module: The pl_module object. + """ + if self.strategy is not None: + self.strategy.update_parameters(pl_module) diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 2a7f9d3b..144cbd6b 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -336,6 +336,11 @@ class SchedulerConfig(BaseModelExtraForbid): params: Params = {} +class TrainingStrategyConfig(BaseModelExtraForbid): + name: str = "TripleLRSGDStrategy" + params: Params = {} + + class TrainerConfig(BaseModelExtraForbid): preprocessing: PreprocessingConfig = PreprocessingConfig() use_rich_progress_bar: bool = True @@ -355,7 +360,6 @@ class TrainerConfig(BaseModelExtraForbid): profiler: Literal["simple", "advanced"] | None = None matmul_precision: Literal["medium", "high", "highest"] | None = None verbose: bool = True - apply_custom_lr: bool = False seed: int | None = None n_validation_batches: PositiveInt | None = None @@ -383,6 +387,7 @@ class TrainerConfig(BaseModelExtraForbid): optimizer: OptimizerConfig = OptimizerConfig() scheduler: SchedulerConfig = SchedulerConfig() + training_strategy: TrainingStrategyConfig = TrainingStrategyConfig() @model_validator(mode="after") def validate_deterministic(self) -> Self: diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 2e5f775f..7a7608a7 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -1,4 +1,3 @@ -import math from collections import defaultdict from collections.abc import Mapping from logging import getLogger @@ -26,7 +25,11 @@ combine_visualizations, get_denormalized_images, ) -from luxonis_train.callbacks import BaseLuxonisProgressBar, ModuleFreezer +from luxonis_train.callbacks import ( + BaseLuxonisProgressBar, + ModuleFreezer, + TrainingManager, +) from luxonis_train.config import AttachedModuleConfig, Config from luxonis_train.nodes import BaseNode from luxonis_train.utils import ( @@ -43,6 +46,7 @@ CALLBACKS, OPTIMIZERS, SCHEDULERS, + STRATEGIES, Registry, ) @@ -269,6 +273,16 @@ def __init__( self.load_checkpoint(self.cfg.model.weights) + if self.cfg.trainer.training_strategy.params: + self.training_strategy = STRATEGIES.get( + self.cfg.trainer.training_strategy.name + )( + pl_module=self, + params=self.cfg.trainer.training_strategy.params, + ) + else: + self.training_strategy = None + @property def core(self) -> "luxonis_train.core.LuxonisModel": """Returns the core model.""" @@ -850,6 +864,9 @@ def configure_callbacks(self) -> list[pl.Callback]: CALLBACKS.get(callback.name)(**callback.params) ) + if self.training_strategy is not None: + callbacks.append(TrainingManager(strategy=self.training_strategy)) + return callbacks def configure_optimizers( @@ -858,45 +875,17 @@ def configure_optimizers( list[torch.optim.Optimizer], list[torch.optim.lr_scheduler.LRScheduler], ]: - """Configures model optimizers and schedulers with optional - custom learning rates and warm-up logic.""" + """Configures 
model optimizers and schedulers.""" + if self.training_strategy is not None: + return self.training_strategy.configure_optimizers() cfg_optimizer = self.cfg.trainer.optimizer cfg_scheduler = self.cfg.trainer.scheduler - if self.cfg.trainer.apply_custom_lr: - assert ( - cfg_optimizer.name == "TripleLRSGD" - ), "Custom learning rate is only supported for TripleLRSGD optimizer." - assert ( - cfg_scheduler.name == "TripleLRScheduler" - ), "Custom learning rate is only supported for TripleLRScheduler scheduler." - - max_stepnum = math.ceil( - len(self._core.loaders["train"]) / self.cfg.trainer.batch_size - ) - custom_optimizer = OPTIMIZERS.get(cfg_optimizer.name)( - self, cfg_optimizer.params - ) - optimizer = custom_optimizer.create_optimizer() - - custom_scheduler = SCHEDULERS.get(cfg_scheduler.name)( - optimizer, - cfg_scheduler.params, - self.cfg.trainer.epochs, - max_stepnum, - ) - scheduler = custom_scheduler.create_scheduler() - - self.custom_scheduler = custom_scheduler - - return [optimizer], [scheduler] - - else: - optim_params = cfg_optimizer.params | { - "params": filter(lambda p: p.requires_grad, self.parameters()), - } - optimizer = OPTIMIZERS.get(cfg_optimizer.name)(**optim_params) + optim_params = cfg_optimizer.params | { + "params": filter(lambda p: p.requires_grad, self.parameters()), + } + optimizer = OPTIMIZERS.get(cfg_optimizer.name)(**optim_params) def get_scheduler(scheduler_cfg, optimizer): scheduler_class = SCHEDULERS.get( @@ -927,12 +916,6 @@ def get_scheduler(scheduler_cfg, optimizer): return [optimizer], [scheduler] - def on_after_backward(self): - """Custom logic to adjust learning rates and momentum after - loss.backward.""" - if self.cfg.trainer.apply_custom_lr: - self.custom_scheduler.update_learning_rate(self.current_epoch) - def load_checkpoint(self, path: str | Path | None) -> None: """Loads checkpoint weights from provided path. 
diff --git a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py index 34e21020..121ac1bc 100644 --- a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py +++ b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py @@ -132,13 +132,14 @@ def __init__( def initialize_weights(self): for m in self.modules(): - t = type(m) - if t is nn.Conv2d: + if isinstance(m, nn.Conv2d): pass - elif t is nn.BatchNorm2d: - m.eps = 1e-3 + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): m.inplace = True def set_export_mode(self, mode: bool = True) -> None: diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index 2ac01805..9d63853f 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -60,13 +60,14 @@ def __init__(self, n_classes: int, in_channels: int): def initialize_weights(self): for m in self.modules(): - t = type(m) - if t is nn.Conv2d: + if isinstance(m, nn.Conv2d): pass - elif t is nn.BatchNorm2d: - m.eps = 1e-3 + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): m.inplace = True def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]: diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index 1394c8fa..0081c6ce 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -105,13 +105,14 @@ def __init__( def initialize_weights(self): for m in self.modules(): - t = type(m) - if t is nn.Conv2d: + if isinstance(m, nn.Conv2d): pass - elif t is nn.BatchNorm2d: - m.eps = 1e-3 + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): m.inplace = True def forward( diff --git a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py index 73908e12..9d02ddcf 100644 --- a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py +++ b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py @@ -172,13 +172,14 @@ def __init__( def initialize_weights(self): for m in self.modules(): - t = type(m) - if t is nn.Conv2d: + if isinstance(m, nn.Conv2d): pass - elif t is nn.BatchNorm2d: - m.eps = 1e-3 + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): m.inplace = True def forward(self, inputs: list[Tensor]) -> list[Tensor]: diff --git a/luxonis_train/optimizers/custom_optimizers.py b/luxonis_train/optimizers/custom_optimizers.py deleted file mode 100644 index f8e28bdb..00000000 --- a/luxonis_train/optimizers/custom_optimizers.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch - - -class TripleLRSGD: - def __init__(self, model: torch.nn.Module, params: dict) -> None: - """TripleLRSGD is a custom optimizer that separates weights into - batch norm weights, regular weights, and biases. 
- - @type model: torch.nn.Module - @param model: The model to be used - @type params: dict - @param params: The parameters to be used for the optimizer - """ - self.model = model - self.params = { - "lr": 0.02, - "momentum": 0.937, - "weight_decay": 0.0005, - "nesterov": True, - } - if params: - self.params.update(params) - - def create_optimizer(self): - batch_norm_weights, regular_weights, biases = [], [], [] - - for module in self.model.modules(): - if hasattr(module, "bias") and isinstance( - module.bias, torch.nn.Parameter - ): - biases.append(module.bias) - if isinstance(module, torch.nn.BatchNorm2d): - batch_norm_weights.append(module.weight) - elif hasattr(module, "weight") and isinstance( - module.weight, torch.nn.Parameter - ): - regular_weights.append(module.weight) - - optimizer = torch.optim.SGD( - [ - { - "params": batch_norm_weights, - "lr": self.params["lr"], - "momentum": self.params["momentum"], - "nesterov": self.params["nesterov"], - }, - { - "params": regular_weights, - "weight_decay": self.params["weight_decay"], - }, - {"params": biases}, - ], - lr=self.params["lr"], - momentum=self.params["momentum"], - nesterov=self.params["nesterov"], - ) - - return optimizer diff --git a/luxonis_train/optimizers/optimizers.py b/luxonis_train/optimizers/optimizers.py index 43ca80ff..c2a4bf12 100644 --- a/luxonis_train/optimizers/optimizers.py +++ b/luxonis_train/optimizers/optimizers.py @@ -2,8 +2,6 @@ from luxonis_train.utils.registry import OPTIMIZERS -from .custom_optimizers import TripleLRSGD - for optimizer in [ optim.Adadelta, optim.Adagrad, @@ -17,6 +15,5 @@ optim.RAdam, optim.RMSprop, optim.SGD, - TripleLRSGD, ]: OPTIMIZERS.register_module(module=optimizer) diff --git a/luxonis_train/schedulers/custom_schedulers.py b/luxonis_train/schedulers/custom_schedulers.py deleted file mode 100644 index f0736b06..00000000 --- a/luxonis_train/schedulers/custom_schedulers.py +++ /dev/null @@ -1,90 +0,0 @@ -import math - -import numpy as np -import torch - - -class TripleLRScheduler: - def __init__( - self, - optimizer: torch.optim.Optimizer, - params: dict, - epochs: int, - max_stepnum: int, - ) -> None: - """TripleLRScheduler is a custom learning rate scheduler that - combines a cosine annealing. - - @type optimizer: torch.optim.Optimizer - @param optimizer: The optimizer to be used - @type parmas: dict - @param parmas: The parameters to be used for the scheduler - @type epochs: int - @param epochs: The number of epochs to train for - @type max_stepnum: int - @param max_stepnum: The maximum number of steps to train for - """ - if optimizer.__class__.__name__ != "SGD": - raise ValueError( - "TripleLRScheduler can only be used with the 'SGD' optimizer." - ) - self.optimizer = optimizer - self.params = { - "warmup_epochs": 3, - "warmup_bias_lr": 0.1, - "warmup_momentum": 0.8, - "lre": 0.0002, - } - if params: - self.params.update(params) - self.max_stepnum = max_stepnum - self.warmup_stepnum = max( - round(self.params["warmup_epochs"] * self.max_stepnum), 1000 - ) - self.step = 0 - self.lrf = self.params["lre"] / self.optimizer.defaults["lr"] - self.lf = ( - lambda x: ((1 - math.cos(x * math.pi / epochs)) / 2) - * (self.lrf - 1) - + 1 - ) - - def create_scheduler(self): - scheduler = torch.optim.lr_scheduler.LambdaLR( - self.optimizer, lr_lambda=self.lf - ) - return scheduler - - def update_learning_rate(self, current_epoch: int) -> None: - """Update the learning rate based on the current epoch. 
- - @type current_epoch: int - @param current_epoch: The current epoch - """ - self.step = self.step % self.max_stepnum - curr_step = self.step + self.max_stepnum * current_epoch - - if curr_step <= self.warmup_stepnum: - for k, param in enumerate(self.optimizer.param_groups): - warmup_bias_lr = ( - self.params["warmup_bias_lr"] if k == 2 else 0.0 - ) - param["lr"] = np.interp( - curr_step, - [0, self.warmup_stepnum], - [ - warmup_bias_lr, - self.optimizer.defaults["lr"] * self.lf(current_epoch), - ], - ) - if "momentum" in param: - self.optimizer.defaults["momentum"] = np.interp( - curr_step, - [0, self.warmup_stepnum], - [ - self.params["warmup_momentum"], - self.optimizer.defaults["momentum"], - ], - ) - - self.step += 1 diff --git a/luxonis_train/schedulers/schedulers.py b/luxonis_train/schedulers/schedulers.py index 12f184e8..488a7498 100644 --- a/luxonis_train/schedulers/schedulers.py +++ b/luxonis_train/schedulers/schedulers.py @@ -2,8 +2,6 @@ from luxonis_train.utils.registry import SCHEDULERS -from .custom_schedulers import TripleLRScheduler - for scheduler in [ lr_scheduler.LambdaLR, lr_scheduler.MultiplicativeLR, @@ -20,6 +18,5 @@ lr_scheduler.CyclicLR, lr_scheduler.OneCycleLR, lr_scheduler.CosineAnnealingWarmRestarts, - TripleLRScheduler, ]: SCHEDULERS.register_module(module=scheduler) diff --git a/luxonis_train/strategies/__init__.py b/luxonis_train/strategies/__init__.py new file mode 100644 index 00000000..5e3b5321 --- /dev/null +++ b/luxonis_train/strategies/__init__.py @@ -0,0 +1,5 @@ +from .triple_lr_sgd import TripleLRScheduler + +__all__ = [ + "TripleLRScheduler", +] diff --git a/luxonis_train/strategies/base_strategy.py b/luxonis_train/strategies/base_strategy.py new file mode 100644 index 00000000..caa952e6 --- /dev/null +++ b/luxonis_train/strategies/base_strategy.py @@ -0,0 +1,27 @@ +from abc import ABC, abstractmethod +from typing import Tuple + +import pytorch_lightning as pl +from luxonis_ml.utils.registry import AutoRegisterMeta +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler + +from luxonis_train.utils.registry import STRATEGIES + + +class BaseTrainingStrategy( + ABC, + metaclass=AutoRegisterMeta, + register=False, + registry=STRATEGIES, +): + def __init__(self, pl_module: pl.LightningModule): + self.pl_module = pl_module + + @abstractmethod + def configure_optimizers(self) -> Tuple[_LRScheduler, Optimizer]: + pass + + @abstractmethod + def update_parameters(self, *args, **kwargs): + pass diff --git a/luxonis_train/strategies/triple_lr_sgd.py b/luxonis_train/strategies/triple_lr_sgd.py new file mode 100644 index 00000000..b04ddea5 --- /dev/null +++ b/luxonis_train/strategies/triple_lr_sgd.py @@ -0,0 +1,172 @@ +# strategies/triple_lr_sgd.py +import math + +import numpy as np +import pytorch_lightning as pl +import torch +from torch.optim import SGD +from torch.optim.lr_scheduler import LambdaLR +from torch.optim.optimizer import Optimizer + +from .base_strategy import BaseTrainingStrategy + + +class TripleLRScheduler: + def __init__( + self, + optimizer: torch.optim.Optimizer, + params: dict, + epochs: int, + max_stepnum: int, + ) -> None: + """TripleLRScheduler scheduler. + + @type optimizer: torch.optim.Optimizer + @param optimizer: The optimizer to be used. + @type params: dict + @param params: The parameters for the scheduler. + @type epochs: int + @param epochs: The number of epochs to train for. + @type max_stepnum: int + @param max_stepnum: The maximum number of steps to train for. 
+ """ + self.optimizer = optimizer + self.params = { + "warmup_epochs": 3, + "warmup_bias_lr": 0.1, + "warmup_momentum": 0.8, + "lre": 0.0002, + } + if params: + self.params.update(params) + self.max_stepnum = max_stepnum + self.warmup_stepnum = max( + round(self.params["warmup_epochs"] * self.max_stepnum), 1000 + ) + self.step = 0 + self.lrf = self.params["lre"] / self.optimizer.defaults["lr"] + self.lf = ( + lambda x: ((1 - math.cos(x * math.pi / epochs)) / 2) + * (self.lrf - 1) + + 1 + ) + + def create_scheduler(self): + scheduler = LambdaLR(self.optimizer, lr_lambda=self.lf) + return scheduler + + def update_learning_rate(self, current_epoch: int) -> None: + self.step = self.step % self.max_stepnum + curr_step = self.step + self.max_stepnum * current_epoch + + if curr_step <= self.warmup_stepnum: + for k, param in enumerate(self.optimizer.param_groups): + warmup_bias_lr = ( + self.params["warmup_bias_lr"] if k == 2 else 0.0 + ) + param["lr"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + warmup_bias_lr, + self.optimizer.defaults["lr"] * self.lf(current_epoch), + ], + ) + if "momentum" in param: + self.optimizer.defaults["momentum"] = np.interp( + curr_step, + [0, self.warmup_stepnum], + [ + self.params["warmup_momentum"], + self.optimizer.defaults["momentum"], + ], + ) + self.step += 1 + + +class TripleLRSGD: + def __init__(self, model: torch.nn.Module, params: dict) -> None: + """TripleLRSGD optimizer. + + @type model: torch.nn.Module + @param model: The model to be used. + @type params: dict + @param params: The parameters for the optimizer. + """ + self.model = model + self.params = { + "lr": 0.02, + "momentum": 0.937, + "weight_decay": 0.0005, + "nesterov": True, + } + if params: + self.params.update(params) + + def create_optimizer(self): + batch_norm_weights, regular_weights, biases = [], [], [] + + for module in self.model.modules(): + if hasattr(module, "bias") and isinstance( + module.bias, torch.nn.Parameter + ): + biases.append(module.bias) + if isinstance(module, torch.nn.BatchNorm2d): + batch_norm_weights.append(module.weight) + elif hasattr(module, "weight") and isinstance( + module.weight, torch.nn.Parameter + ): + regular_weights.append(module.weight) + + optimizer = SGD( + [ + { + "params": batch_norm_weights, + "lr": self.params["lr"], + "momentum": self.params["momentum"], + "nesterov": self.params["nesterov"], + }, + { + "params": regular_weights, + "weight_decay": self.params["weight_decay"], + }, + {"params": biases}, + ], + lr=self.params["lr"], + momentum=self.params["momentum"], + nesterov=self.params["nesterov"], + ) + + return optimizer + + +class TripleLRSGDStrategy(BaseTrainingStrategy): + def __init__(self, pl_module: pl.LightningModule, params: dict): + """TripleLRSGD strategy. + + @type pl_module: pl.LightningModule + @param pl_module: The pl_module to be used. + @type params: dict + @param params: The parameters for the strategy. 
+ """ + super().__init__(pl_module) + self.model = pl_module + self.params = params + self.cfg = self.model.cfg + + max_stepnum = math.ceil( + len(self.model._core.loaders["train"]) + / self.cfg.trainer.batch_size + ) + + self.optimizer = TripleLRSGD(self.model, params).create_optimizer() + self.scheduler = TripleLRScheduler( + self.optimizer, params, self.cfg.trainer.epochs, max_stepnum + ) + + def configure_optimizers(self) -> tuple[list[Optimizer], list[LambdaLR]]: + return [self.optimizer], [self.scheduler.create_scheduler()] + + def update_parameters(self, *args, **kwargs): + current_epoch = self.model.current_epoch + self.scheduler.update_learning_rate(current_epoch) diff --git a/luxonis_train/utils/registry.py b/luxonis_train/utils/registry.py index 8044f13c..4f413c7a 100644 --- a/luxonis_train/utils/registry.py +++ b/luxonis_train/utils/registry.py @@ -35,6 +35,11 @@ SCHEDULERS: Registry[type[LRScheduler]] = Registry(name="schedulers") """Registry for all schedulers.""" +STRATEGIES: Registry[type["lt.strategies.BaseTrainingStrategy"]] = Registry( + name="strategies" +) +"""Registry for all strategies.""" + VISUALIZERS: Registry[type["lt.visualizers.BaseVisualizer"]] = Registry( "visualizers" ) From 6e0c888c038dd0627b602cf26919df939e75c4a3 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Fri, 15 Nov 2024 13:18:26 +0100 Subject: [PATCH 09/21] use self.core --- luxonis_train/models/luxonis_lightning.py | 2 +- luxonis_train/strategies/triple_lr_sgd.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 7a7608a7..896c4410 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -865,7 +865,7 @@ def configure_callbacks(self) -> list[pl.Callback]: ) if self.training_strategy is not None: - callbacks.append(TrainingManager(strategy=self.training_strategy)) + callbacks.append(TrainingManager(strategy=self.training_strategy)) # type: ignore return callbacks diff --git a/luxonis_train/strategies/triple_lr_sgd.py b/luxonis_train/strategies/triple_lr_sgd.py index b04ddea5..33f7dfe3 100644 --- a/luxonis_train/strategies/triple_lr_sgd.py +++ b/luxonis_train/strategies/triple_lr_sgd.py @@ -155,8 +155,7 @@ def __init__(self, pl_module: pl.LightningModule, params: dict): self.cfg = self.model.cfg max_stepnum = math.ceil( - len(self.model._core.loaders["train"]) - / self.cfg.trainer.batch_size + len(self.model.core.loaders["train"]) / self.cfg.trainer.batch_size ) self.optimizer = TripleLRSGD(self.model, params).create_optimizer() From c0bdbfc30dd54f640a29c4743cf87abf6118e9cb Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Fri, 15 Nov 2024 14:12:31 +0100 Subject: [PATCH 10/21] type-check fix --- luxonis_train/strategies/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/luxonis_train/strategies/__init__.py b/luxonis_train/strategies/__init__.py index 5e3b5321..b83d7190 100644 --- a/luxonis_train/strategies/__init__.py +++ b/luxonis_train/strategies/__init__.py @@ -1,5 +1,7 @@ +from .base_strategy import BaseTrainingStrategy from .triple_lr_sgd import TripleLRScheduler __all__ = [ "TripleLRScheduler", + "BaseTrainingStrategy", ] From 0c3bbbe44d6a918297f9c264d5cdddeced43e825 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Fri, 15 Nov 2024 14:31:30 +0100 Subject: [PATCH 11/21] type-check fix --- luxonis_train/models/luxonis_lightning.py | 2 +- luxonis_train/strategies/base_strategy.py | 7 ++++--- 2 files changed, 5 
insertions(+), 4 deletions(-) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 896c4410..58cdb5a5 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -278,7 +278,7 @@ def __init__( self.cfg.trainer.training_strategy.name )( pl_module=self, - params=self.cfg.trainer.training_strategy.params, + params=self.cfg.trainer.training_strategy.params, # type: ignore ) else: self.training_strategy = None diff --git a/luxonis_train/strategies/base_strategy.py b/luxonis_train/strategies/base_strategy.py index caa952e6..8de6386d 100644 --- a/luxonis_train/strategies/base_strategy.py +++ b/luxonis_train/strategies/base_strategy.py @@ -1,10 +1,9 @@ from abc import ABC, abstractmethod -from typing import Tuple import pytorch_lightning as pl from luxonis_ml.utils.registry import AutoRegisterMeta from torch.optim import Optimizer -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.lr_scheduler import LRScheduler from luxonis_train.utils.registry import STRATEGIES @@ -19,7 +18,9 @@ def __init__(self, pl_module: pl.LightningModule): self.pl_module = pl_module @abstractmethod - def configure_optimizers(self) -> Tuple[_LRScheduler, Optimizer]: + def configure_optimizers( + self, + ) -> tuple[list[Optimizer], list[LRScheduler]]: pass @abstractmethod From dc1d468dc9a93db47c78fee3aaf156f7732f270d Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Fri, 15 Nov 2024 14:59:45 +0100 Subject: [PATCH 12/21] add docs --- configs/README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/configs/README.md b/configs/README.md index 8a9e1c01..e8281237 100644 --- a/configs/README.md +++ b/configs/README.md @@ -376,6 +376,37 @@ trainer: eta_min: 0 ``` +### Training Strategy + +Defines the training strategy to be used. Currently, only the `TripleLRSGDStrategy` is supported, but more strategies will be added in the future. + +| Key | Type | Default value | Description | +| ----------------- | ------- | ----------------------- | ---------------------------------------------- | +| `name` | `str` | `"TripleLRSGDStrategy"` | Name of the training strategy | +| `warmup_epochs` | `int` | `3` | Number of epochs for the warmup phase | +| `warmup_bias_lr` | `float` | `0.1` | Learning rate for bias during the warmup phase | +| `warmup_momentum` | `float` | `0.8` | Momentum value during the warmup phase | +| `lr` | `float` | `0.02` | Initial learning rate | +| `lre` | `float` | `0.0002` | End learning rate | +| `momentum` | `float` | `0.937` | Momentum for the optimizer | +| `weight_decay` | `float` | `0.0005` | Weight decay value | +| `nesterov` | `bool` | `true` | Use Nesterov momentum or not | + +**Example:** + +```yaml +training_strategy: + name: "TripleLRSGDStrategy" + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.02 + lre: 0.0002 + momentum: 0.937 + weight_decay: 0.0005 + nesterov: true +``` + ## Exporter Here you can define configuration for exporting. 
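
> Editor's note on the new "Training Strategy" section above: a complete `trainer` block wiring in the strategy might look like the minimal sketch below. It assumes the `training_strategy` section nests under `trainer` (it is a field of `TrainerConfig` in `luxonis_train/config/config.py`) and that the strategy options go under a `params` key (as `TrainingStrategyConfig` defines `name` and `params`); `optimizer` and `scheduler` are left unset because the strategy supplies both. The values shown are the defaults from the table above.

```yaml
trainer:
  training_strategy:
    name: "TripleLRSGDStrategy"
    params:
      warmup_epochs: 3
      warmup_bias_lr: 0.1
      warmup_momentum: 0.8
      lr: 0.02
      lre: 0.0002
      momentum: 0.937
      weight_decay: 0.0005
      nesterov: true
```
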
From 9dfd8da115136543a97f50682561f98f6e9ec282 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Sat, 16 Nov 2024 13:29:44 +0100 Subject: [PATCH 13/21] add docs and minor fix --- README.md | 2 ++ luxonis_train/config/config.py | 4 ++-- luxonis_train/models/luxonis_lightning.py | 10 +++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6d718afa..f6f82ed2 100644 --- a/README.md +++ b/README.md @@ -567,6 +567,7 @@ model.tune() - [**Callbacks**](https://github.com/luxonis/luxonis-train/blob/main/luxonis_train/callbacks/README.md): Allow custom code to be executed at different stages of training. - [**Optimizers**](https://github.com/luxonis/luxonis-train/blob/main/configs/README.md#optimizer): Control how the model's weights are updated. - [**Schedulers**](https://github.com/luxonis/luxonis-train/blob/main/configs/README.md#scheduler): Adjust the learning rate during training. +- [**Training Strategy**](https://github.com/luxonis/luxonis-train/blob/main/configs/README.md#training-strategy): Specify a custom combination of optimizer and scheduler to tailor the training process for specific use cases. **Creating Custom Components:** @@ -581,6 +582,7 @@ Registered components can be referenced in the config file. Custom components ne - **Callbacks** - [`lightning.pytorch.callbacks.Callback`](https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html), requires manual registration to the `CALLBACKS` registry - **Optimizers** - [`torch.optim.Optimizer`](https://pytorch.org/docs/stable/optim.html#torch.optim.Optimizer), requires manual registration to the `OPTIMIZERS` registry - **Schedulers** - [`torch.optim.lr_scheduler.LRScheduler`](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate), requires manual registration to the `SCHEDULERS` registry +- **Training Strategy** - [`BaseTrainingStrategy`](https://github.com/luxonis/luxonis-train/blob/main/luxonis_train/strategies/base_strategy.py) **Examples:** diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 144cbd6b..d233c0eb 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -337,7 +337,7 @@ class SchedulerConfig(BaseModelExtraForbid): class TrainingStrategyConfig(BaseModelExtraForbid): - name: str = "TripleLRSGDStrategy" + name: str params: Params = {} @@ -387,7 +387,7 @@ class TrainerConfig(BaseModelExtraForbid): optimizer: OptimizerConfig = OptimizerConfig() scheduler: SchedulerConfig = SchedulerConfig() - training_strategy: TrainingStrategyConfig = TrainingStrategyConfig() + training_strategy: TrainingStrategyConfig | None = None @model_validator(mode="after") def validate_deterministic(self) -> Self: diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 58cdb5a5..429ee4e3 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -273,7 +273,15 @@ def __init__( self.load_checkpoint(self.cfg.model.weights) - if self.cfg.trainer.training_strategy.params: + if self.cfg.trainer.training_strategy is not None: + if self.cfg.trainer.optimizer is not None: + logger.warning( + "Training strategy is active; the specified optimizer will be ignored." + ) + if self.cfg.trainer.scheduler is not None: + logger.warning( + "Training strategy is active; the specified scheduler will be ignored." 
+ ) self.training_strategy = STRATEGIES.get( self.cfg.trainer.training_strategy.name )( From 75096dac50a066f1b7f9a06487bfc7ff622b1b4a Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Mon, 18 Nov 2024 13:22:13 +0100 Subject: [PATCH 14/21] fix failing tests --- tests/integration/test_simple.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index b29e0420..c4024914 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -69,10 +69,11 @@ def test_predefined_models( config_file = f"configs/{config_file}.yaml" opts |= { "loader.params.dataset_name": ( - cifar10_dataset.dataset_name + cifar10_dataset.identifier if "classification" in config_file - else coco_dataset.dataset_name + else coco_dataset.identifier ), + "trainer.epochs": 1, } model = LuxonisModel(config_file, opts) model.train() From 23dda47066ea8e3a33ce9769ad31842c08313e87 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Mon, 18 Nov 2024 16:39:56 +0100 Subject: [PATCH 15/21] modify the logic for assigning the optimizer and scheduler --- luxonis_train/config/config.py | 44 ++++++++++++++++------- luxonis_train/models/luxonis_lightning.py | 14 ++++---- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index d233c0eb..da7ea603 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -327,12 +327,12 @@ class CallbackConfig(BaseModelExtraForbid): class OptimizerConfig(BaseModelExtraForbid): - name: str = "Adam" + name: str params: Params = {} class SchedulerConfig(BaseModelExtraForbid): - name: str = "ConstantLR" + name: str params: Params = {} @@ -385,8 +385,8 @@ class TrainerConfig(BaseModelExtraForbid): callbacks: list[CallbackConfig] = [] - optimizer: OptimizerConfig = OptimizerConfig() - scheduler: SchedulerConfig = SchedulerConfig() + optimizer: OptimizerConfig | None = None + scheduler: SchedulerConfig | None = None training_strategy: TrainingStrategyConfig | None = None @model_validator(mode="after") @@ -536,16 +536,34 @@ def smart_auto_populate(cls, instance: "Config") -> None: """Automatically populates config fields based on rules, with warnings.""" + # Rule: Set default optimizer and scheduler if training_strategy is not defined and optimizer and scheduler are None + if instance.trainer.training_strategy is None: + if instance.trainer.optimizer is None: + instance.trainer.optimizer = OptimizerConfig( + name="Adam", params={} + ) + logger.warning( + "Optimizer not specified. Automatically set to `Adam`." + ) + if instance.trainer.scheduler is None: + instance.trainer.scheduler = SchedulerConfig( + name="ConstantLR", params={} + ) + logger.warning( + "Scheduler not specified. Automatically set to `ConstantLR`." + ) + # Rule: CosineAnnealingLR should have T_max set to the number of epochs if not provided - scheduler = instance.trainer.scheduler - if ( - scheduler.name == "CosineAnnealingLR" - and "T_max" not in scheduler.params - ): - scheduler.params["T_max"] = instance.trainer.epochs - logger.warning( - "`T_max` was not set for `CosineAnnealingLR`. Automatically set `T_max` to number of epochs." - ) + if instance.trainer.scheduler is not None: + scheduler = instance.trainer.scheduler + if ( + scheduler.name == "CosineAnnealingLR" + and "T_max" not in scheduler.params + ): + scheduler.params["T_max"] = instance.trainer.epochs + logger.warning( + "`T_max` was not set for `CosineAnnealingLR`. 
Automatically set `T_max` to number of epochs." + ) # Rule: Mosaic4 should have out_width and out_height matching train_image_size if not provided for augmentation in instance.trainer.preprocessing.augmentations: diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 429ee4e3..57cc6cd5 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -274,13 +274,13 @@ def __init__( self.load_checkpoint(self.cfg.model.weights) if self.cfg.trainer.training_strategy is not None: - if self.cfg.trainer.optimizer is not None: - logger.warning( - "Training strategy is active; the specified optimizer will be ignored." - ) - if self.cfg.trainer.scheduler is not None: - logger.warning( - "Training strategy is active; the specified scheduler will be ignored." + if ( + self.cfg.trainer.optimizer is not None + or self.cfg.trainer.scheduler is not None + ): + raise ValueError( + "Training strategy is defined, but optimizer or scheduler is also defined. " + "Please remove optimizer and scheduler from the config." ) self.training_strategy = STRATEGIES.get( self.cfg.trainer.training_strategy.name From 1f16b463de127bcf77b16fad5dbed3910f14be11 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Mon, 18 Nov 2024 16:58:46 +0100 Subject: [PATCH 16/21] fix failing test --- tests/integration/test_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index c4024914..007338c4 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -77,7 +77,7 @@ def test_predefined_models( } model = LuxonisModel(config_file, opts) model.train() - model.test() + model.test(view="train") def test_multi_input(opts: dict[str, Any], infer_path: Path): From d308e3353179ad1f257959d2f595f34c3e514859 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Tue, 19 Nov 2024 06:04:51 +0100 Subject: [PATCH 17/21] fix type-check error --- luxonis_train/models/luxonis_lightning.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index 57cc6cd5..08d0066f 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -890,6 +890,11 @@ def configure_optimizers( cfg_optimizer = self.cfg.trainer.optimizer cfg_scheduler = self.cfg.trainer.scheduler + if cfg_optimizer is None or cfg_scheduler is None: + raise ValueError( + "Optimizer and scheduler configuration must not be None." 
+ ) + optim_params = cfg_optimizer.params | { "params": filter(lambda p: p.requires_grad, self.parameters()), } From f5965c704948f2911bd800310c9d89dd726b19c7 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Tue, 19 Nov 2024 07:04:29 +0100 Subject: [PATCH 18/21] type-check fix --- tests/integration/test_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index 007338c4..e32980f2 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -281,7 +281,7 @@ def test_smart_cfg_auto_populate( } model = LuxonisModel(config_file, opts) assert ( - model.cfg.trainer.scheduler.params["T_max"] == model.cfg.trainer.epochs + model.cfg.trainer.scheduler.params["T_max"] == model.cfg.trainer.epochs # type: ignore ) assert ( model.cfg.trainer.preprocessing.augmentations[0].params["out_width"] From 093feecf123db7b203e578049338d193e0f9e00e Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Tue, 19 Nov 2024 08:12:03 +0100 Subject: [PATCH 19/21] fix failing tests --- luxonis_train/config/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index da7ea603..941cd649 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -517,6 +517,7 @@ def get_config( ) -> "Config": instance = super().get_config(cfg, overrides) if not isinstance(cfg, str): + cls.smart_auto_populate(instance) return instance fs = LuxonisFileSystem(cfg) if fs.is_mlflow: From 4c61f3823dcf97ad918b6bbd83e1003b6a595968 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Tue, 19 Nov 2024 11:59:32 +0100 Subject: [PATCH 20/21] clamp kpts values --- .../attached_modules/visualizers/keypoint_visualizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py b/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py index 8c7252ee..f9f4150e 100644 --- a/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py @@ -57,6 +57,12 @@ def draw_predictions( prediction = predictions[i] mask = prediction[..., 2] < visibility_threshold visible_kpts = prediction[..., :2] * (~mask).unsqueeze(-1).float() + visible_kpts[..., 0] = torch.clamp( + visible_kpts[..., 0], 0, canvas.size(-1) - 1 + ) + visible_kpts[..., 1] = torch.clamp( + visible_kpts[..., 1], 0, canvas.size(-2) - 1 + ) viz[i] = draw_keypoints( canvas[i].clone(), visible_kpts[..., :2], From 2951676ad13a288be0057fd2e1344395b3a61765 Mon Sep 17 00:00:00 2001 From: Jernej Sabadin Date: Tue, 19 Nov 2024 13:07:37 +0100 Subject: [PATCH 21/21] initialize_weights configurable param --- luxonis_train/nodes/README.md | 59 ++++++++++--------- .../backbones/efficientrep/efficientrep.py | 6 +- luxonis_train/nodes/blocks/blocks.py | 13 ---- .../nodes/heads/efficient_bbox_head.py | 6 +- .../nodes/necks/reppan_neck/reppan_neck.py | 6 +- 5 files changed, 46 insertions(+), 44 deletions(-) diff --git a/luxonis_train/nodes/README.md b/luxonis_train/nodes/README.md index cef35029..7c7b53c4 100644 --- a/luxonis_train/nodes/README.md +++ b/luxonis_train/nodes/README.md @@ -82,16 +82,17 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). 
**Parameters:** -| Key | Type | Default value | Description | -| ------------------ | ----------------------------------------------------------------- | --------------------------- | -------------------------------------------------------------------------- | -| `variant` | `Literal["n", "nano", "s", "small", "m", "medium", "l", "large"]` | `"nano"` | Variant of the network | -| `channels_list` | `list[int]` | \[64, 128, 256, 512, 1024\] | List of number of channels for each block | -| `n_repeats` | `list[int]` | \[1, 6, 12, 18, 6\] | List of number of repeats of `RepVGGBlock` | -| `depth_mul` | `float` | `0.33` | Depth multiplier | -| `width_mul` | `float` | `0.25` | Width multiplier | -| `block` | `Literal["RepBlock", "CSPStackRepBlock"]` | `"RepBlock"` | Base block used | -| `csp_e` | `float` | `0.5` | Factor for intermediate channels when block is set to `"CSPStackRepBlock"` | -| `download_weights` | `bool` | `True` | If True download weights from COCO (if available for specified variant) | +| Key | Type | Default value | Description | +| -------------------- | ----------------------------------------------------------------- | --------------------------- | -------------------------------------------------------------------------- | +| `variant` | `Literal["n", "nano", "s", "small", "m", "medium", "l", "large"]` | `"nano"` | Variant of the network | +| `channels_list` | `list[int]` | \[64, 128, 256, 512, 1024\] | List of number of channels for each block | +| `n_repeats` | `list[int]` | \[1, 6, 12, 18, 6\] | List of number of repeats of `RepVGGBlock` | +| `depth_mul` | `float` | `0.33` | Depth multiplier | +| `width_mul` | `float` | `0.25` | Width multiplier | +| `block` | `Literal["RepBlock", "CSPStackRepBlock"]` | `"RepBlock"` | Base block used | +| `csp_e` | `float` | `0.5` | Factor for intermediate channels when block is set to `"CSPStackRepBlock"` | +| `download_weights` | `bool` | `True` | If True download weights from COCO (if available for specified variant) | +| `initialize_weights` | `bool` | `True` | If True, initialize weights. | ### RexNetV1_lite @@ -175,17 +176,18 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). **Parameters:** -| Key | Type | Default value | Description | -| ------------------ | ----------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------------- | -| `variant` | `Literal["n", "nano", "s", "small", "m", "medium", "l", "large"]` | `"nano"` | Variant of the network | -| `n_heads` | `Literal[2,3,4]` | `3` | Number of output heads. 
Should be same also on the connected head in most cases | -| `channels_list` | `list[int]` | `[256, 128, 128, 256, 256, 512]` | List of number of channels for each block | -| `n_repeats` | `list[int]` | `[12, 12, 12, 12]` | List of number of repeats of `RepVGGBlock` | -| `depth_mul` | `float` | `0.33` | Depth multiplier | -| `width_mul` | `float` | `0.25` | Width multiplier | -| `block` | `Literal["RepBlock", "CSPStackRepBlock"]` | `"RepBlock"` | Base block used | -| `csp_e` | `float` | `0.5` | Factor for intermediate channels when block is set to `"CSPStackRepBlock"` | -| `download_weights` | `bool` | `False` | If True download weights from COCO (if available for specified variant) | +| Key | Type | Default value | Description | +| -------------------- | ----------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------------- | +| `variant` | `Literal["n", "nano", "s", "small", "m", "medium", "l", "large"]` | `"nano"` | Variant of the network | +| `n_heads` | `Literal[2,3,4]` | `3` | Number of output heads. Should be same also on the connected head in most cases | +| `channels_list` | `list[int]` | `[256, 128, 128, 256, 256, 512]` | List of number of channels for each block | +| `n_repeats` | `list[int]` | `[12, 12, 12, 12]` | List of number of repeats of `RepVGGBlock` | +| `depth_mul` | `float` | `0.33` | Depth multiplier | +| `width_mul` | `float` | `0.25` | Width multiplier | +| `block` | `Literal["RepBlock", "CSPStackRepBlock"]` | `"RepBlock"` | Base block used | +| `csp_e` | `float` | `0.5` | Factor for intermediate channels when block is set to `"CSPStackRepBlock"` | +| `download_weights` | `bool` | `False` | If True download weights from COCO (if available for specified variant) | +| `initialize_weights` | `bool` | `True` | If True, initialize weights. | ## Heads @@ -217,13 +219,14 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). **Parameters:** -| Key | Type | Default value | Description | -| ------------------ | ------- | ------------- | --------------------------------------------------------------------- | -| `n_heads` | `bool` | `3` | Number of output heads | -| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | -| `iou_thres` | `float` | `0.45` | `IoU` threshold for non-maxima-suppression (used for evaluation) | -| `max_det` | `int` | `300` | Maximum number of detections retained after NMS | -| `download_weights` | `bool` | `False` | If True download weights from COCO | +| Key | Type | Default value | Description | +| -------------------- | ------- | ------------- | --------------------------------------------------------------------- | +| `n_heads` | `bool` | `3` | Number of output heads | +| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | +| `iou_thres` | `float` | `0.45` | `IoU` threshold for non-maxima-suppression (used for evaluation) | +| `max_det` | `int` | `300` | Maximum number of detections retained after NMS | +| `download_weights` | `bool` | `False` | If True download weights from COCO | +| `initialize_weights` | `bool` | `True` | If True, initialize weights. 
| ### `EfficientKeypointBBoxHead` diff --git a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py index 121ac1bc..340cc10a 100644 --- a/luxonis_train/nodes/backbones/efficientrep/efficientrep.py +++ b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py @@ -30,6 +30,7 @@ def __init__( block: Literal["RepBlock", "CSPStackRepBlock"] | None = None, csp_e: float | None = None, download_weights: bool = True, + initialize_weights: bool = True, **kwargs: Any, ): """Implementation of the EfficientRep backbone. Supports the @@ -65,6 +66,8 @@ def __init__( overrides the variant value. @type download_weights: bool @param download_weights: If True download weights from COCO (if available for specified variant). Defaults to True. + @type initialize_weights: bool + @param initialize_weights: If True, initialize weights of the model. """ super().__init__(**kwargs) @@ -125,7 +128,8 @@ def __init__( ) ) - self.initialize_weights() + if initialize_weights: + self.initialize_weights() if download_weights and var.weights_path: self.load_checkpoint(var.weights_path) diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index 9d63853f..25bea7c5 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -56,19 +56,6 @@ def __init__(self, n_classes: int, in_channels: int): prior_prob = 1e-2 self._initialize_weights_and_biases(prior_prob) - self.initialize_weights() - - def initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - pass - elif isinstance(m, nn.BatchNorm2d): - m.eps = 0.001 - m.momentum = 0.03 - elif isinstance( - m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) - ): - m.inplace = True def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]: out_feature = self.decoder(x) diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index 0081c6ce..531a294f 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -30,6 +30,7 @@ def __init__( iou_thres: float = 0.45, max_det: int = 300, download_weights: bool = False, + initialize_weights: bool = True, **kwargs: Any, ): """Head for object detection. @@ -51,6 +52,8 @@ def __init__( @type download_weights: bool @param download_weights: If True download weights from COCO. Defaults to False. + @type initialize_weights: bool + @param initialize_weights: If True, initialize weights. """ super().__init__(**kwargs) @@ -95,7 +98,8 @@ def __init__( f"output{i+1}_yolov6r2" for i in range(self.n_heads) ] - self.initialize_weights() + if initialize_weights: + self.initialize_weights() if download_weights: # TODO: Handle variants of head in a nicer way diff --git a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py index 9d02ddcf..2c95890a 100644 --- a/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py +++ b/luxonis_train/nodes/necks/reppan_neck/reppan_neck.py @@ -27,6 +27,7 @@ def __init__( block: Literal["RepBlock", "CSPStackRepBlock"] | None = None, csp_e: float | None = None, download_weights: bool = False, + initialize_weights: bool = True, **kwargs: Any, ): """Implementation of the RepPANNeck module. Supports the version @@ -65,6 +66,8 @@ def __init__( overrides the variant value. 
@type download_weights: bool @param download_weights: If True download weights from COCO (if available for specified variant). Defaults to False. + @type initialize_weights: bool + @param initialize_weights: If True, initialize weights of the model. """ super().__init__(**kwargs) @@ -165,7 +168,8 @@ def __init__( out_channels = channels_list_down_blocks[2 * i + 1] curr_n_repeats = n_repeats_down_blocks[i] - self.initialize_weights() + if initialize_weights: + self.initialize_weights() if download_weights and var.weights_path: self.load_checkpoint(var.weights_path)
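With `initialize_weights` exposed as a node parameter, the hard-coded BatchNorm/activation adjustments can be switched off per node, for example when fine-tuning from externally trained weights. A hypothetical config snippet — the `model.nodes` layout follows the conventions in `configs/README.md`, and the model name is a placeholder:

```yaml
model:
  name: detection_example
  nodes:
    - name: EfficientRep
      params:
        download_weights: false
        # Skip the hard-coded BatchNorm/activation tweaks and keep PyTorch
        # defaults, e.g. when the weights come from an external checkpoint.
        initialize_weights: false
```

The same flag is documented for `RepPANNeck` and `EfficientBBoxHead` in the tables above.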